"""howard.objects.variants"""

    1import csv
    2import gc
    3import gzip
    4import io
    5import multiprocessing
    6import os
    7import random
    8import re
    9import shlex
   10import sqlite3
   11import subprocess
   12from tempfile import NamedTemporaryFile, TemporaryDirectory
   13import tempfile
   14import duckdb
   15import json
   16import yaml
   17import argparse
   18import Bio.bgzf as bgzf
   19import pandas as pd
   20from pyfaidx import Fasta
   21import numpy as np
   22import vcf
   23import logging as log
   24import fastparquet as fp
   25from multiprocesspandas import applyparallel
   26
   27from howard.functions.commons import *
   28from howard.objects.database import *
   29from howard.functions.databases import *
   30from howard.functions.utils import *
   31
   32
   33class Variants:
   34
   35    def __init__(
   36        self,
   37        conn=None,
   38        input: str = None,
   39        output: str = None,
   40        config: dict = {},
   41        param: dict = {},
   42        load: bool = False,
   43    ) -> None:
   44        """
   45        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   46        header
   47
   48        :param conn: the connection to the database
   49        :param input: the input file
   50        :param output: the output file
   51        :param config: a dictionary containing the configuration of the model
   52        :param param: a dictionary containing the parameters of the model
   53        """
   54
   55        # Init variables
   56        self.init_variables()
   57
   58        # Input
   59        self.set_input(input)
   60
   61        # Config
   62        self.set_config(config)
   63
   64        # Param
   65        self.set_param(param)
   66
   67        # Output
   68        self.set_output(output)
   69
   70        # connexion
   71        self.set_connexion(conn)
   72
   73        # Header
   74        self.set_header()
   75
   76        # Load data
   77        if load:
   78            self.load_data()
   79
   80    def set_input(self, input: str = None) -> None:
   81        """
   82        The function `set_input` takes a file name as input, extracts the name and extension, and sets
   83        attributes in the class accordingly.
   84
   85        :param input: The `set_input` method in the provided code snippet is used to set attributes
   86        related to the input file. Here's a breakdown of the parameters and their usage in the method:
   87        :type input: str
   88        """
   89
   90        if input and not isinstance(input, str):
   91            try:
   92                self.input = input.name
   93            except:
   94                log.error(f"Input file '{input} in bad format")
   95                raise ValueError(f"Input file '{input} in bad format")
   96        else:
   97            self.input = input
   98
   99        # Input format
  100        if input:
  101            input_name, input_extension = os.path.splitext(self.input)
  102            self.input_name = input_name
  103            self.input_extension = input_extension
  104            self.input_format = self.input_extension.replace(".", "")
  105
  106    def set_config(self, config: dict) -> None:
  107        """
  108        The set_config function takes a config object and assigns it as the configuration object for the
  109        class.
  110
  111        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  112        contains configuration settings for the class. When you call the `set_config` function with a
  113        dictionary object as the argument, it will set that dictionary as the configuration object for
  114        the class
  115        :type config: dict
  116        """
  117
  118        self.config = config
  119
  120    def set_param(self, param: dict) -> None:
  121        """
  122        This function sets a parameter object for the class based on the input dictionary.
  123
  124        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  125        as the `param` attribute of the class instance
  126        :type param: dict
  127        """
  128
  129        self.param = param
  130
  131    def init_variables(self) -> None:
  132        """
  133        This function initializes the variables that will be used in the rest of the class
  134        """
  135
  136        self.prefix = "howard"
  137        self.table_variants = "variants"
  138        self.dataframe = None
  139
  140        self.comparison_map = {
  141            "gt": ">",
  142            "gte": ">=",
  143            "lt": "<",
  144            "lte": "<=",
  145            "equals": "=",
  146            "contains": "SIMILAR TO",
  147        }
  148
  149        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  150
  151        self.code_type_map_to_sql = {
  152            "Integer": "INTEGER",
  153            "String": "VARCHAR",
  154            "Float": "FLOAT",
  155            "Flag": "VARCHAR",
  156        }
  157
  158        self.index_additionnal_fields = []
  159
  160    def get_indexing(self) -> bool:
  161        """
  162        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  163        returns False.
  164        :return: The value of the indexing parameter.
  165        """
  166
  167        return self.get_param().get("indexing", False)
  168
  169    def get_connexion_config(self) -> dict:
  170        """
  171        The function `get_connexion_config` returns a dictionary containing the configuration for a
  172        connection, including the number of threads and memory limit.
  173        :return: a dictionary containing the configuration for the Connexion library.
  174        """
  175
  176        # config
  177        config = self.get_config()
  178
  179        # Connexion config
  180        connexion_config = {}
  181        threads = self.get_threads()
  182
  183        # Threads
  184        if threads:
  185            connexion_config["threads"] = threads
  186
  187        # Memory
  188        # if config.get("memory", None):
  189        #     connexion_config["memory_limit"] = config.get("memory")
  190        if self.get_memory():
  191            connexion_config["memory_limit"] = self.get_memory()
  192
  193        # Temporary directory
  194        if config.get("tmp", None):
  195            connexion_config["temp_directory"] = config.get("tmp")
  196
  197        # Access
  198        if config.get("access", None):
  199            access = config.get("access")
  200            if access in ["RO"]:
  201                access = "READ_ONLY"
  202            elif access in ["RW"]:
  203                access = "READ_WRITE"
  204            connexion_db = self.get_connexion_db()
  205            if connexion_db in ":memory:":
  206                access = "READ_WRITE"
  207            connexion_config["access_mode"] = access
  208
  209        return connexion_config
  210
  211    def get_duckdb_settings(self) -> dict:
  212        """
  213        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  214        string.
  215        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  216        """
  217
  218        # config
  219        config = self.get_config()
  220
  221        # duckdb settings
  222        duckdb_settings_dict = {}
  223        if config.get("duckdb_settings", None):
  224            duckdb_settings = config.get("duckdb_settings")
  225            duckdb_settings = full_path(duckdb_settings)
  226            # duckdb setting is a file
  227            if os.path.exists(duckdb_settings):
  228                with open(duckdb_settings) as json_file:
  229                    duckdb_settings_dict = yaml.safe_load(json_file)
  230            # duckdb settings is a string
  231            else:
  232                duckdb_settings_dict = json.loads(duckdb_settings)
  233
  234        return duckdb_settings_dict
  235
  236    def set_connexion_db(self) -> str:
  237        """
  238        The function `set_connexion_db` returns the appropriate database connection string based on the
  239        input format and connection type.
  240        :return: the value of the variable `connexion_db`.
  241        """
  242
  243        # Default connexion db
  244        default_connexion_db = ":memory:"
  245
  246        # Find connexion db
  247        if self.get_input_format() in ["db", "duckdb"]:
  248            connexion_db = self.get_input()
  249        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  250            connexion_db = default_connexion_db
  251        elif self.get_connexion_type() in ["tmpfile"]:
  252            tmp_name = tempfile.mkdtemp(
  253                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  254            )
  255            connexion_db = f"{tmp_name}/tmp.db"
  256        elif self.get_connexion_type() != "":
  257            connexion_db = self.get_connexion_type()
  258        else:
  259            connexion_db = default_connexion_db
  260
  261        # Set connexion db
  262        self.connexion_db = connexion_db
  263
  264        return connexion_db
  265
  266    def set_connexion(self, conn) -> None:
  267        """
  268        The function `set_connexion` creates a connection to a database, with options for different
  269        database formats and settings.
  270
  271        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  272        database. If a connection is not provided, a new connection to an in-memory database is created.
  273        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  274        sqlite
  275        """
  276
  277        # Connexion db
  278        connexion_db = self.set_connexion_db()
  279
  280        # Connexion config
  281        connexion_config = self.get_connexion_config()
  282
  283        # Connexion format
  284        connexion_format = self.get_config().get("connexion_format", "duckdb")
  285        # Set connexion format
  286        self.connexion_format = connexion_format
  287
  288        # Connexion
  289        if not conn:
  290            if connexion_format in ["duckdb"]:
  291                conn = duckdb.connect(connexion_db, config=connexion_config)
  292                # duckDB settings
  293                duckdb_settings = self.get_duckdb_settings()
  294                if duckdb_settings:
  295                    for setting in duckdb_settings:
  296                        setting_value = duckdb_settings.get(setting)
  297                        if isinstance(setting_value, str):
  298                            setting_value = f"'{setting_value}'"
  299                        conn.execute(f"PRAGMA {setting}={setting_value};")
  300            elif connexion_format in ["sqlite"]:
  301                conn = sqlite3.connect(connexion_db)
  302
  303        # Set connexion
  304        self.conn = conn
  305
  306        # Log
  307        log.debug(f"connexion_format: {connexion_format}")
  308        log.debug(f"connexion_db: {connexion_db}")
  309        log.debug(f"connexion config: {connexion_config}")
  310        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  311
  312    def set_output(self, output: str = None) -> None:
  313        """
  314        The `set_output` function in Python sets the output file based on the input or a specified key
  315        in the config file, extracting the output name, extension, and format.
  316
  317        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  318        the output file. If the config file has an 'output' key, the method sets the output to the value
  319        of that key. If no output is provided, it sets the output to `None`
  320        :type output: str
  321        """
  322
  323        if output and not isinstance(output, str):
  324            self.output = output.name
  325        else:
  326            self.output = output
  327
  328        # Output format
  329        if self.output:
  330            output_name, output_extension = os.path.splitext(self.output)
  331            self.output_name = output_name
  332            self.output_extension = output_extension
  333            self.output_format = self.output_extension.replace(".", "")
  334        else:
  335            self.output_name = None
  336            self.output_extension = None
  337            self.output_format = None
  338
    def set_header(self) -> None:
        """
        Read the header of the input file and store it both as a list of
        strings (`self.header_list`) and as a VCF object (`self.header_vcf`).

        Header lookup order: the "header_file" config entry, the input file
        itself (vcf/hdr formats, compressed or not), a sibling "<input>.hdr"
        file, then a header inferred from the file columns; a minimal default
        VCF header is the final fallback. When there is no input file, both
        attributes are set to None.

        :raises ValueError: if the input file format is not supported
        """

        input_file = self.get_input()
        # Minimal valid VCF header used as fallback
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param (config "header_file" takes precedence)
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database object on the input file
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file (vcf.Writer emits the header lines on creation)
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except:
                        # NOTE(review): bare except is deliberate best-effort —
                        # any failure falls back to the default VCF header
                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # unsupported input format

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            # No input file: no header available
            self.header_list = None
            self.header_vcf = None
  440
  441    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  442        """
  443        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  444        DataFrame based on the connection format.
  445
  446        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  447        represents the SQL query you want to execute. This query will be used to fetch data from a
  448        database and convert it into a pandas DataFrame
  449        :type query: str
  450        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  451        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  452        function will only fetch up to that number of rows from the database query result. If no limit
  453        is specified,
  454        :type limit: int
  455        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  456        """
  457
  458        # Connexion format
  459        connexion_format = self.get_connexion_format()
  460
  461        # Limit in query
  462        if limit:
  463            pd.set_option("display.max_rows", limit)
  464            if connexion_format in ["duckdb"]:
  465                df = (
  466                    self.conn.execute(query)
  467                    .fetch_record_batch(limit)
  468                    .read_next_batch()
  469                    .to_pandas()
  470                )
  471            elif connexion_format in ["sqlite"]:
  472                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  473
  474        # Full query
  475        else:
  476            if connexion_format in ["duckdb"]:
  477                df = self.conn.execute(query).df()
  478            elif connexion_format in ["sqlite"]:
  479                df = pd.read_sql_query(query, self.conn)
  480
  481        return df
  482
  483    def get_overview(self) -> None:
  484        """
  485        The function prints the input, output, config, and dataframe of the current object
  486        """
  487        table_variants_from = self.get_table_variants(clause="from")
  488        sql_columns = self.get_header_columns_as_sql()
  489        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  490        df = self.get_query_to_df(sql_query_export)
  491        log.info(
  492            "Input:  "
  493            + str(self.get_input())
  494            + " ["
  495            + str(str(self.get_input_format()))
  496            + "]"
  497        )
  498        log.info(
  499            "Output: "
  500            + str(self.get_output())
  501            + " ["
  502            + str(str(self.get_output_format()))
  503            + "]"
  504        )
  505        log.info("Config: ")
  506        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  507            "\n"
  508        ):
  509            log.info("\t" + str(d))
  510        log.info("Param: ")
  511        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  512            "\n"
  513        ):
  514            log.info("\t" + str(d))
  515        log.info("Sample list: " + str(self.get_header_sample_list()))
  516        log.info("Dataframe: ")
  517        for d in str(df).split("\n"):
  518            log.info("\t" + str(d))
  519
  520        # garbage collector
  521        del df
  522        gc.collect()
  523
  524        return None
  525
    def get_stats(self) -> dict:
        """
        Compute statistics of the current object: input file, variants
        (counts by chromosome, SNV/MNV/InDel counts, substitutions), samples
        with genotype counts, header INFO/FORMAT fields, and quality stats.

        NOTE(review): the SQL uses functions such as REGEXP_EXTRACT, len(),
        string_split(), median() and variance() — presumably DuckDB dialect;
        the sqlite connexion format likely cannot run these. TODO confirm.

        :return: a dictionary with keys "Infos", "Variants", "Samples"
            (when applicable), "Header" and "Quality" (when applicable)
        """

        # Log
        log.info(f"Stats Calculation...")

        # Variants table name
        table_variants_from = self.get_table_variants()

        # stats dict, "Infos" collects scalar summary values
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO and FORMAT field descriptors
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chromosome
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Fraction of variants per chromosome (0..1, not percent)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check samples: only when genotypes are present (GT format + FORMAT column)
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes per sample; rows must have a valid genotype
                # and as many sample fields as FORMAT declares
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                # Only samples with at least one genotype are reported
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        # #
        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
        #     stats["Infos"]["Number of samples"] = nb_of_samples
        # elif nb_of_samples:
        #     stats["Infos"]["Number of samples"] = "not a VCF format"

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # `i` is a running index shared across both field lists
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # Number: map special VCF "Number" codes to their letters
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # Type ("." when absent, e.g. Flag)
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # Description (empty string when absent)
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL statistics (missing quality "." excluded)
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV, MNV and InDel counts (classified by REF/ALT lengths)

        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # Substitution spectrum for SNVs (e.g. "A>G")
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
  747
  748    def stats_to_file(self, file: str = None) -> str:
  749        """
  750        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  751        into a JSON object, and writes the JSON object to the specified file.
  752
  753        :param file: The `file` parameter is a string that represents the file path where the JSON data
  754        will be written
  755        :type file: str
  756        :return: the name of the file that was written to.
  757        """
  758
  759        # Get stats
  760        stats = self.get_stats()
  761
  762        # Serializing json
  763        json_object = json.dumps(stats, indent=4)
  764
  765        # Writing to sample.json
  766        with open(file, "w") as outfile:
  767            outfile.write(json_object)
  768
  769        return file
  770
  771    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  772        """
  773        The `print_stats` function generates a markdown file and prints the statistics contained in a
  774        JSON file in a formatted manner.
  775
  776        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  777        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  778        provided, a temporary directory will be created and the stats will be saved in a file named
  779        "stats.md" within that
  780        :type output_file: str
  781        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  782        file where the statistics will be saved. If no value is provided, a temporary directory will be
  783        created and a default file name "stats.json" will be used
  784        :type json_file: str
  785        :return: The function `print_stats` does not return any value. It has a return type annotation
  786        of `None`.
  787        """
  788
  789        # Full path
  790        output_file = full_path(output_file)
  791        json_file = full_path(json_file)
  792
  793        with tempfile.TemporaryDirectory() as tmpdir:
  794
  795            # Files
  796            if not output_file:
  797                output_file = os.path.join(tmpdir, "stats.md")
  798            if not json_file:
  799                json_file = os.path.join(tmpdir, "stats.json")
  800
  801            # Create folders
  802            if not os.path.exists(os.path.dirname(output_file)):
  803                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  804            if not os.path.exists(os.path.dirname(json_file)):
  805                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  806
  807            # Create stats JSON file
  808            stats_file = self.stats_to_file(file=json_file)
  809
  810            # Print stats file
  811            with open(stats_file) as f:
  812                stats = yaml.safe_load(f)
  813
  814            # Output
  815            output_title = []
  816            output_index = []
  817            output = []
  818
  819            # Title
  820            output_title.append("# HOWARD Stats")
  821
  822            # Index
  823            output_index.append("## Index")
  824
  825            # Process sections
  826            for section in stats:
  827                infos = stats.get(section)
  828                section_link = "#" + section.lower().replace(" ", "-")
  829                output.append(f"## {section}")
  830                output_index.append(f"- [{section}]({section_link})")
  831
  832                if len(infos):
  833                    for info in infos:
  834                        try:
  835                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  836                            is_df = True
  837                        except:
  838                            try:
  839                                df = pd.DataFrame.from_dict(
  840                                    json.loads((infos.get(info))), orient="index"
  841                                )
  842                                is_df = True
  843                            except:
  844                                is_df = False
  845                        if is_df:
  846                            output.append(f"### {info}")
  847                            info_link = "#" + info.lower().replace(" ", "-")
  848                            output_index.append(f"   - [{info}]({info_link})")
  849                            output.append(f"{df.to_markdown(index=False)}")
  850                        else:
  851                            output.append(f"- {info}: {infos.get(info)}")
  852                else:
  853                    output.append(f"NA")
  854
  855            # Write stats in markdown file
  856            with open(output_file, "w") as fp:
  857                for item in output_title:
  858                    fp.write("%s\n" % item)
  859                for item in output_index:
  860                    fp.write("%s\n" % item)
  861                for item in output:
  862                    fp.write("%s\n" % item)
  863
  864            # Output stats in markdown
  865            print("")
  866            print("\n\n".join(output_title))
  867            print("")
  868            print("\n\n".join(output))
  869            print("")
  870
  871        return None
  872
  873    def get_input(self) -> str:
  874        """
  875        It returns the value of the input variable.
  876        :return: The input is being returned.
  877        """
  878        return self.input
  879
  880    def get_input_format(self, input_file: str = None) -> str:
  881        """
  882        This function returns the format of the input variable, either from the provided input file or
  883        by prompting for input.
  884
  885        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  886        represents the file path of the input file. If no `input_file` is provided when calling the
  887        method, it will default to `None`
  888        :type input_file: str
  889        :return: The format of the input variable is being returned.
  890        """
  891
  892        if not input_file:
  893            input_file = self.get_input()
  894        input_format = get_file_format(input_file)
  895        return input_format
  896
  897    def get_input_compressed(self, input_file: str = None) -> str:
  898        """
  899        The function `get_input_compressed` returns the format of the input variable after compressing
  900        it.
  901
  902        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  903        that represents the file path of the input file. If no `input_file` is provided when calling the
  904        method, it will default to `None` and the method will then call `self.get_input()` to
  905        :type input_file: str
  906        :return: The function `get_input_compressed` returns the compressed format of the input
  907        variable.
  908        """
  909
  910        if not input_file:
  911            input_file = self.get_input()
  912        input_compressed = get_file_compressed(input_file)
  913        return input_compressed
  914
  915    def get_output(self) -> str:
  916        """
  917        It returns the output of the neuron.
  918        :return: The output of the neural network.
  919        """
  920
  921        return self.output
  922
  923    def get_output_format(self, output_file: str = None) -> str:
  924        """
  925        The function `get_output_format` returns the format of the input variable or the output file if
  926        provided.
  927
  928        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  929        that represents the file path of the output file. If no `output_file` is provided when calling
  930        the method, it will default to the output obtained from the `get_output` method of the class
  931        instance. The
  932        :type output_file: str
  933        :return: The format of the input variable is being returned.
  934        """
  935
  936        if not output_file:
  937            output_file = self.get_output()
  938        output_format = get_file_format(output_file)
  939
  940        return output_format
  941
  942    def get_config(self) -> dict:
  943        """
  944        It returns the config
  945        :return: The config variable is being returned.
  946        """
  947        return self.config
  948
  949    def get_param(self) -> dict:
  950        """
  951        It returns the param
  952        :return: The param variable is being returned.
  953        """
  954        return self.param
  955
  956    def get_connexion_db(self) -> str:
  957        """
  958        It returns the connexion_db attribute of the object
  959        :return: The connexion_db is being returned.
  960        """
  961        return self.connexion_db
  962
  963    def get_prefix(self) -> str:
  964        """
  965        It returns the prefix of the object.
  966        :return: The prefix is being returned.
  967        """
  968        return self.prefix
  969
  970    def get_table_variants(self, clause: str = "select") -> str:
  971        """
  972        This function returns the table_variants attribute of the object
  973
  974        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
  975        defaults to select (optional)
  976        :return: The table_variants attribute of the object.
  977        """
  978
  979        # Access
  980        access = self.get_config().get("access", None)
  981
  982        # Clauses "select", "where", "update"
  983        if clause in ["select", "where", "update"]:
  984            table_variants = self.table_variants
  985        # Clause "from"
  986        elif clause in ["from"]:
  987            # For Read Only
  988            if self.get_input_format() in ["parquet"] and access in ["RO"]:
  989                input_file = self.get_input()
  990                table_variants = f"'{input_file}' as variants"
  991            # For Read Write
  992            else:
  993                table_variants = f"{self.table_variants} as variants"
  994        else:
  995            table_variants = self.table_variants
  996        return table_variants
  997
  998    def get_tmp_dir(self) -> str:
  999        """
 1000        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1001        parameters or a default path.
 1002        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1003        configuration, parameters, and a default value of "/tmp".
 1004        """
 1005
 1006        return get_tmp(
 1007            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1008        )
 1009
 1010    def get_connexion_type(self) -> str:
 1011        """
 1012        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1013
 1014        :return: The connexion type is being returned.
 1015        """
 1016        return self.get_config().get("connexion_type", "memory")
 1017
 1018    def get_connexion(self):
 1019        """
 1020        It returns the connection object
 1021
 1022        :return: The connection object.
 1023        """
 1024        return self.conn
 1025
 1026    def close_connexion(self) -> None:
 1027        """
 1028        This function closes the connection to the database.
 1029        :return: The connection is being closed.
 1030        """
 1031        return self.conn.close()
 1032
 1033    def get_header(self, type: str = "vcf"):
 1034        """
 1035        This function returns the header of the VCF file as a list of strings
 1036
 1037        :param type: the type of header you want to get, defaults to vcf (optional)
 1038        :return: The header of the vcf file.
 1039        """
 1040
 1041        if self.header_vcf:
 1042            if type == "vcf":
 1043                return self.header_vcf
 1044            elif type == "list":
 1045                return self.header_list
 1046        else:
 1047            if type == "vcf":
 1048                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1049                return header
 1050            elif type == "list":
 1051                return vcf_required
 1052
 1053    def get_header_length(self, file: str = None) -> int:
 1054        """
 1055        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1056        line.
 1057
 1058        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1059        header file. If this argument is provided, the function will read the header from the specified
 1060        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1061        :type file: str
 1062        :return: the length of the header list, excluding the #CHROM line.
 1063        """
 1064
 1065        if file:
 1066            return len(self.read_vcf_header_file(file=file)) - 1
 1067        elif self.get_header(type="list"):
 1068            return len(self.get_header(type="list")) - 1
 1069        else:
 1070            return 0
 1071
 1072    def get_header_columns(self) -> str:
 1073        """
 1074        This function returns the header list of a VCF
 1075
 1076        :return: The length of the header list.
 1077        """
 1078        if self.get_header():
 1079            return self.get_header(type="list")[-1]
 1080        else:
 1081            return ""
 1082
 1083    def get_header_columns_as_list(self) -> list:
 1084        """
 1085        This function returns the header list of a VCF
 1086
 1087        :return: The length of the header list.
 1088        """
 1089        if self.get_header():
 1090            return self.get_header_columns().strip().split("\t")
 1091        else:
 1092            return []
 1093
 1094    def get_header_columns_as_sql(self) -> str:
 1095        """
 1096        This function retruns header length (without #CHROM line)
 1097
 1098        :return: The length of the header list.
 1099        """
 1100        sql_column_list = []
 1101        for col in self.get_header_columns_as_list():
 1102            sql_column_list.append(f'"{col}"')
 1103        return ",".join(sql_column_list)
 1104
 1105    def get_header_sample_list(self) -> list:
 1106        """
 1107        This function retruns header length (without #CHROM line)
 1108
 1109        :return: The length of the header list.
 1110        """
 1111        return self.header_vcf.samples
 1112
 1113    def get_verbose(self) -> bool:
 1114        """
 1115        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1116        exist
 1117
 1118        :return: The value of the key "verbose" in the config dictionary.
 1119        """
 1120        return self.get_config().get("verbose", False)
 1121
 1122    def get_connexion_format(self) -> str:
 1123        """
 1124        It returns the connexion format of the object.
 1125        :return: The connexion_format is being returned.
 1126        """
 1127        connexion_format = self.connexion_format
 1128        if connexion_format not in ["duckdb", "sqlite"]:
 1129            log.error(f"Unknown connexion format {connexion_format}")
 1130            raise ValueError(f"Unknown connexion format {connexion_format}")
 1131        else:
 1132            return connexion_format
 1133
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited text file in chunks and append its rows to the
        "variants" table of the current database connection.

        :param file: Path or file-like object of the delimited file to load
        :param columns: Comma-separated, SQL-quoted column names used in the
        INSERT statement (duckdb connexions only)
        :type columns: str
        :param header_len: Number of leading lines to skip (e.g. the VCF
        header lines), defaults to 0
        :type header_len: int (optional)
        :param sep: Field delimiter of the input file, defaults to "\t"
        :type sep: str (optional)
        :param chunksize: Number of rows read per chunk; may be overridden by
        the "load.chunk" config entry, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config: the "load.chunk" config entry overrides the argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): if chunksize resolves to a falsy value (e.g. 0 in
        # config), the file is silently not loaded
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # "FROM chunk" refers to the local pandas DataFrame named
                    # `chunk` — DuckDB resolves it via its replacement scan;
                    # the variable name is therefore load-bearing
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # pandas performs the append for SQLite connections
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1187
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        Load the input file into the "variants" table of the database.

        For duckdb connexions the file is loaded (or exposed as a view in
        read-only mode) through the Database helper; for sqlite connexions a
        VCF/TSV/CSV/PSV file is inserted chunk by chunk. Afterwards, INFO
        fields are optionally exploded into columns and indexes are created.

        :param input_file: The path to the input file to load. When provided,
        it replaces the current input and the header is re-read
        :type input_file: str
        :param drop_variants_table: If True, drop the variants table before
        loading, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: Number of rows sampled to infer the schema; a
        falsy value means "no limit" (-1), defaults to 20480
        :type sample_size: int (optional)
        :raises ValueError: if the input format is not compatible with the
        connexion format, or the input cannot be loaded
        """

        log.info("Loading...")

        # Change input file and refresh the header accordingly
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # Drop variants table if asked
        if drop_variants_table:
            self.drop_variants_table()

        # Get table variants name
        table_variants = self.get_table_variants()

        # Access mode (e.g. "RO" for read-only)
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compression
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # Compression label for logging
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format ("duckdb" or "sqlite")
        connexion_format = self.get_connexion_format()

        # Sample size: falsy means unlimited (-1)
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists: nothing to load
            if self.input_format in ["db", "duckdb"]:

                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table (read-write) or View (read-only)
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                # NOTE(review): bare except hides the underlying error; the
                # original exception would help diagnose load failures
                except:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion: load supported delimited formats chunk by chunk
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main VCF column structure (column name -> SQL type)
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure with samples
            # NOTE(review): structure_complete aliases structure (no copy),
            # so both names refer to the same dict; harmless here since
            # structure is not reused independently afterwards
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns lists for CREATE TABLE and INSERT statements
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database table if needed
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize defines the number of rows per chunk when loading
            chunksize = 100000

            # Delimiter inferred from the input format (tab by default)
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                # NOTE(review): for compressed input a second (bgzf) handle is
                # opened and shadows input_file; it is not explicitly closed —
                # only the original text handle is closed by the with-block
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()
 1383
 1384    def get_explode_infos(self) -> bool:
 1385        """
 1386        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1387        to False if it is not set.
 1388        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1389        value. If the parameter is not present, it will return False.
 1390        """
 1391
 1392        return self.get_param().get("explode", {}).get("explode_infos", False)
 1393
 1394    def get_explode_infos_fields(
 1395        self,
 1396        explode_infos_fields: str = None,
 1397        remove_fields_not_in_header: bool = False,
 1398    ) -> list:
 1399        """
 1400        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1401        the input parameter `explode_infos_fields`.
 1402
 1403        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1404        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1405        comma-separated list of field names to explode
 1406        :type explode_infos_fields: str
 1407        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1408        flag that determines whether to remove fields that are not present in the header. If it is set
 1409        to `True`, any field that is not in the header will be excluded from the list of exploded
 1410        information fields. If it is set to `, defaults to False
 1411        :type remove_fields_not_in_header: bool (optional)
 1412        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1413        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1414        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1415        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1416        splitting the string by commas.
 1417        """
 1418
 1419        # If no fields, get it in param
 1420        if not explode_infos_fields:
 1421            explode_infos_fields = (
 1422                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1423            )
 1424
 1425        # If no fields, defined as all fields in header using keyword
 1426        if not explode_infos_fields:
 1427            explode_infos_fields = "*"
 1428
 1429        # If fields list not empty
 1430        if explode_infos_fields:
 1431
 1432            # Input fields list
 1433            if isinstance(explode_infos_fields, str):
 1434                fields_input = explode_infos_fields.split(",")
 1435            elif isinstance(explode_infos_fields, list):
 1436                fields_input = explode_infos_fields
 1437            else:
 1438                fields_input = []
 1439
 1440            # Fields list without * keyword
 1441            fields_without_all = fields_input.copy()
 1442            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1443                fields_without_all.remove("*")
 1444
 1445            # Fields in header
 1446            fields_in_header = sorted(list(set(self.get_header().infos)))
 1447
 1448            # Construct list of fields
 1449            fields_output = []
 1450            for field in fields_input:
 1451
 1452                # Strip field
 1453                field = field.strip()
 1454
 1455                # format keyword * in regex
 1456                if field.upper() in ["*"]:
 1457                    field = ".*"
 1458
 1459                # Find all fields with pattern
 1460                r = re.compile(field)
 1461                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1462
 1463                # Remove fields input from search
 1464                if field in fields_search:
 1465                    fields_search = [field]
 1466                elif fields_search != [field]:
 1467                    fields_search = sorted(
 1468                        list(set(fields_search).difference(fields_input))
 1469                    )
 1470
 1471                # If field is not in header (avoid not well formatted header)
 1472                if not fields_search and not remove_fields_not_in_header:
 1473                    fields_search = [field]
 1474
 1475                # Add found fields
 1476                for new_field in fields_search:
 1477                    # Add field, if not already exists, and if it is in header (if asked)
 1478                    if (
 1479                        new_field not in fields_output
 1480                        and (
 1481                            not remove_fields_not_in_header
 1482                            or new_field in fields_in_header
 1483                        )
 1484                        and new_field not in [".*"]
 1485                    ):
 1486                        fields_output.append(new_field)
 1487
 1488            return fields_output
 1489
 1490        else:
 1491
 1492            return []
 1493
 1494    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1495        """
 1496        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1497        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1498        not provided.
 1499
 1500        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1501        prefix to be used for exploding or expanding information
 1502        :type explode_infos_prefix: str
 1503        :return: the value of the variable `explode_infos_prefix`.
 1504        """
 1505
 1506        if not explode_infos_prefix:
 1507            explode_infos_prefix = (
 1508                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1509            )
 1510
 1511        return explode_infos_prefix
 1512
    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        Add a column to a SQLite or DuckDB table, with an optional default value.

        :param table_name: name of the table to which the column is added
        :param column_name: name of the column to add
        :param column_type: SQL data type of the new column, as a string
        (e.g. "INTEGER", "TEXT", "VARCHAR")
        :param default_value: optional default value assigned to the column
        for existing rows; appended verbatim to the ALTER TABLE statement
        :param drop: when True and the column already exists, drop it first
        and re-create it; when False (default), an existing column is left
        untouched and None is returned, defaults to False
        :type drop: bool (optional)
        :return: a dict describing the newly added column ("table_name",
        "column_name", "column_type", "default_value"), or None when the
        column already existed — including when it was dropped and
        re-created via drop=True
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table
        # (LIMIT 0 fetches only the column names, no rows; comparison is
        # case-insensitive)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # A dropped-and-recreated column is not reported as newly added
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column
 1584
 1585    def drop_column(
 1586        self, column: dict = None, table_name: str = None, column_name: str = None
 1587    ) -> bool:
 1588        """
 1589        The `drop_column` function drops a specified column from a given table in a database and returns
 1590        True if the column was successfully dropped, and False if the column does not exist in the
 1591        table.
 1592
 1593        :param column: The `column` parameter is a dictionary that contains information about the column
 1594        you want to drop. It has two keys:
 1595        :type column: dict
 1596        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1597        drop a column
 1598        :type table_name: str
 1599        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1600        from the table
 1601        :type column_name: str
 1602        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1603        and False if the column does not exist in the table.
 1604        """
 1605
 1606        # Find column infos
 1607        if column:
 1608            if isinstance(column, dict):
 1609                table_name = column.get("table_name", None)
 1610                column_name = column.get("column_name", None)
 1611            elif isinstance(column, str):
 1612                table_name = self.get_table_variants()
 1613                column_name = column
 1614            else:
 1615                table_name = None
 1616                column_name = None
 1617
 1618        if not table_name and not column_name:
 1619            return False
 1620
 1621        # Removed
 1622        removed = False
 1623
 1624        # Check if the column already exists in the table
 1625        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1626        columns = self.get_query_to_df(query).columns.tolist()
 1627        if column_name in columns:
 1628            log.debug(f"The {column_name} column exists in the {table_name} table")
 1629        else:
 1630            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1631            return False
 1632
 1633        # Add column in table # ALTER TABLE integers DROP k
 1634        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1635        self.execute_query(add_column_query)
 1636        removed = True
 1637        log.debug(
 1638            f"The {column_name} column was successfully dropped to the {table_name} table"
 1639        )
 1640
 1641        return removed
 1642
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
        individual columns, returning a list of added columns.

        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
        `self.get_explode_infos_prefix()` as the prefix, falling back to "INFO/"
        :type prefix: str
        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
        `False`, indexes will not be created. The default value is `False`, defaults to False
        :type create_index: bool (optional)
        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
        that you want to explode into individual columns. Patterns are translated through
        `self.get_explode_infos_fields` before use
        :type fields: list
        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
        determines whether to drop and recreate a column if it already exists in the table. If `force`
        is set to `True`, the column will be dropped and recreated. If `force` is set to `False`,
        defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
        flag that determines whether to process all the INFO fields together (one UPDATE statement) or
        individually (one UPDATE per field). The default value is `False`, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
        of the table where the exploded INFO fields will be added as individual columns. If not
        provided, the variants table is used
        :type table: str
        :return: The `explode_infos` function returns a list of added columns (dicts as returned by
        `add_column`).
        """

        # drop indexes (they would interfere with the ALTER/UPDATE statements below)
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        # No columns can be added in read-only mode
        if access not in ["RO"]:

            # prefix: fall back to the configured prefix, then to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (columns present in the table but not in the header)
            # NOTE(review): bare except — any failure here is silently treated
            # as "no extra infos"; confirm this best-effort is intended
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check (header fields plus explicitly requested ones)
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Column name for the exploded field
                info_id_sql = prefix + info

                # Only explode fields known from the header, the request, or the table
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Determine SQL type from the header; unknown fields
                    # default to String with num 0
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields (num != 1) are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # The UPDATE is also (re)run when force=True, even if the
                    # column already existed
                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the expression extracting the field value from INFO
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            # SQLite has no REGEXP_EXTRACT: emulate it with
                            # nested instr/substr calls
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        # NOTE(review): if connexion_format is neither "duckdb"
                        # nor "sqlite", update_info_field is unbound (NameError)
                        # or stale from a previous iteration — confirm those are
                        # the only supported formats
                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Process chromosome by chromosome to keep UPDATEs smaller
                # NOTE(review): bare except — falls back to a single pass over
                # the whole table if the chromosome list cannot be fetched
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only needed when splitting by chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        # Single UPDATE setting every exploded field at once
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        # One UPDATE per exploded field
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
 1859
 1860    def create_indexes(self) -> None:
 1861        """
 1862        Create indexes on the table after insertion
 1863        """
 1864
 1865        # Access
 1866        access = self.get_config().get("access", None)
 1867
 1868        # get table variants
 1869        table_variants = self.get_table_variants("FROM")
 1870
 1871        if self.get_indexing() and access not in ["RO"]:
 1872            # Create index
 1873            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 1874            self.conn.execute(sql_create_table_index)
 1875            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 1876            self.conn.execute(sql_create_table_index)
 1877            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 1878            self.conn.execute(sql_create_table_index)
 1879            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 1880            self.conn.execute(sql_create_table_index)
 1881            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 1882            self.conn.execute(sql_create_table_index)
 1883            for field in self.index_additionnal_fields:
 1884                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 1885                self.conn.execute(sql_create_table_index)
 1886
 1887    def drop_indexes(self) -> None:
 1888        """
 1889        Create indexes on the table after insertion
 1890        """
 1891
 1892        # Access
 1893        access = self.get_config().get("access", None)
 1894
 1895        # get table variants
 1896        table_variants = self.get_table_variants("FROM")
 1897
 1898        # Get database format
 1899        connexion_format = self.get_connexion_format()
 1900
 1901        if access not in ["RO"]:
 1902            if connexion_format in ["duckdb"]:
 1903                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 1904            elif connexion_format in ["sqlite"]:
 1905                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 1906
 1907            list_indexes = self.conn.execute(sql_list_indexes)
 1908            index_names = [row[0] for row in list_indexes.fetchall()]
 1909            for index in index_names:
 1910                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 1911                self.conn.execute(sql_drop_table_index)
 1912
 1913    def read_vcf_header(self, f) -> list:
 1914        """
 1915        It reads the header of a VCF file and returns a list of the header lines
 1916
 1917        :param f: the file object
 1918        :return: The header lines of the VCF file.
 1919        """
 1920
 1921        header_list = []
 1922        for line in f:
 1923            header_list.append(line)
 1924            if line.startswith("#CHROM"):
 1925                break
 1926        return header_list
 1927
 1928    def read_vcf_header_file(self, file: str = None) -> list:
 1929        """
 1930        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 1931        uncompressed files.
 1932
 1933        :param file: The `file` parameter is a string that represents the path to the VCF header file
 1934        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 1935        default to `None`
 1936        :type file: str
 1937        :return: The function `read_vcf_header_file` returns a list.
 1938        """
 1939
 1940        if self.get_input_compressed(input_file=file):
 1941            with bgzf.open(file, "rt") as f:
 1942                return self.read_vcf_header(f=f)
 1943        else:
 1944            with open(file, "rt") as f:
 1945                return self.read_vcf_header(f=f)
 1946
 1947    def execute_query(self, query: str):
 1948        """
 1949        It takes a query as an argument, executes it, and returns the results
 1950
 1951        :param query: The query to be executed
 1952        :return: The result of the query is being returned.
 1953        """
 1954        if query:
 1955            return self.conn.execute(query)  # .fetchall()
 1956        else:
 1957            return None
 1958
    def export_output(
        self,
        output_file: str | None = None,
        output_header: str | None = None,
        export_header: bool = True,
        query: str | None = None,
        parquet_partitions: list | None = None,
        chunk_size: int | None = None,
        threads: int | None = None,
        sort: bool = False,
        index: bool = False,
        order_by: str | None = None,
    ) -> bool:
        """
        The `export_output` function exports data from a VCF file to a specified output file in various
        formats, including VCF, CSV, TSV, PSV, and Parquet.

        :param output_file: The `output_file` parameter is a string that specifies the name of the
        output file to be generated by the function. This is where the exported data will be saved.
        Defaults to `self.get_output()` when not provided
        :type output_file: str
        :param output_header: The `output_header` parameter is a string that specifies the name of the
        file where the header of the VCF file will be exported. If this parameter is not provided, the
        header will be exported to a file with the same name as the `output_file` parameter, but with
        the extension ".hdr"
        :type output_header: str
        :param export_header: The `export_header` parameter is a boolean flag that determines whether
        the header of a VCF file should be exported to a separate file or not. If `export_header` is
        True, the header will be exported to a file. If `export_header` is False, the header will not
        be, defaults to True, if output format is not VCF
        :type export_header: bool (optional)
        :param query: The `query` parameter is an optional SQL query that can be used to filter and
        select specific data from the VCF file before exporting it. If provided, only the data that
        matches the query will be exported
        :type query: str
        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
        columns to be used for partitioning the Parquet file during export. A comma-separated string is
        also accepted and split into a list
        :type parquet_partitions: list
        :param chunk_size: The `chunk_size` parameter specifies the number of
        records in batch when exporting data in Parquet format. This parameter is used for
        partitioning the Parquet file into multiple files.
        :type chunk_size: int
        :param threads: The `threads` parameter is an optional parameter that specifies the number of
        threads to be used during the export process. If not provided, the function will use
        `self.get_threads()`
        :type threads: int
        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
        False
        :type sort: bool (optional)
        :param index: The `index` parameter is a boolean flag that determines whether an index should be
        created on the output file. If `index` is True, an index will be created. If `index` is False,
        no index will be created. The default value is False, defaults to False
        :type index: bool (optional)
        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
        sorting the output file. This parameter is only applicable when exporting data in VCF format
        :type order_by: str
        :return: True if the output file exists after export, None otherwise.
        """

        # Log
        log.info("Exporting...")

        # Full path
        output_file = full_path(output_file)
        output_header = full_path(output_header)

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Tmp files to remove
        tmp_to_remove = []

        # If no output, get it
        if not output_file:
            output_file = self.get_output()

        # If not threads
        if not threads:
            threads = self.get_threads()

        # Auto header name with extension
        if export_header or output_header:
            if not output_header:
                output_header = f"{output_file}.hdr"
            # Export header
            # NOTE(review): output_header is not forwarded here — export_header
            # presumably derives the header path from output_file; confirm
            self.export_header(output_file=output_file)

        # Switch off export header if VCF output (the header is embedded in
        # the VCF itself, so the side file becomes temporary)
        output_file_type = get_file_format(output_file)
        if output_file_type in ["vcf"]:
            export_header = False
            tmp_to_remove.append(output_header)

        # Chunk size
        if not chunk_size:
            chunk_size = config.get("chunk_size", None)

        # Parquet partition (accepts a comma-separated string)
        if not parquet_partitions:
            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
        if parquet_partitions and isinstance(parquet_partitions, str):
            parquet_partitions = parquet_partitions.split(",")

        # Order by
        if not order_by:
            order_by = param.get("export", {}).get("order_by", "")

        # Header in output
        header_in_output = param.get("export", {}).get("include_header", False)

        # Database
        database_source = self.get_connexion()

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Explode infos
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=False,
            )

        # if connexion_format in ["sqlite"] or query:
        if connexion_format in ["sqlite"]:

            # SQLite source: dump the variants table to a temporary Parquet
            # file so the Database export path can read it
            # (`string` comes from a wildcard import — TODO confirm)
            random_tmp = "".join(
                random.choice(string.ascii_lowercase) for i in range(10)
            )
            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
            tmp_to_remove.append(database_source)

            # Table Variants
            table_variants = self.get_table_variants()

            # Create export query
            sql_query_export_subquery = f"""
                SELECT * FROM {table_variants}
                """

            # Write source file
            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))

        # Create database
        database = Database(
            database=database_source,
            table="variants",
            header_file=output_header,
            conn_config=self.get_connexion_config(),
        )

        # Existing colomns header
        # existing_columns_header = database.get_header_file_columns(output_header)
        existing_columns_header = database.get_header_columns_from_database()

        # Export file
        database.export(
            output_database=output_file,
            output_header=output_header,
            existing_columns_header=existing_columns_header,
            parquet_partitions=parquet_partitions,
            chunk_size=chunk_size,
            threads=threads,
            sort=sort,
            index=index,
            header_in_output=header_in_output,
            order_by=order_by,
            query=query,
            export_header=export_header,
        )

        # Remove
        remove_if_exists(tmp_to_remove)

        # NOTE(review): both `and` operands are identical — this is equivalent
        # to `os.path.exists(output_file) or None`
        return (os.path.exists(output_file) or None) and (
            os.path.exists(output_file) or None
        )
 2146
 2147    def get_extra_infos(self, table: str = None) -> list:
 2148        """
 2149        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2150        in the header.
 2151
 2152        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2153        name of the table from which you want to retrieve the extra columns that are not present in the
 2154        header. If the `table` parameter is not provided when calling the function, it will default to
 2155        using the variants
 2156        :type table: str
 2157        :return: A list of columns that are in the specified table but not in the header of the table.
 2158        """
 2159
 2160        header_columns = []
 2161
 2162        if not table:
 2163            table = self.get_table_variants(clause="from")
 2164            header_columns = self.get_header_columns()
 2165
 2166        # Check all columns in the database
 2167        query = f""" SELECT * FROM {table} LIMIT 1 """
 2168        log.debug(f"query {query}")
 2169        table_columns = self.get_query_to_df(query).columns.tolist()
 2170        extra_columns = []
 2171
 2172        # Construct extra infos (not in header)
 2173        for column in table_columns:
 2174            if column not in header_columns:
 2175                extra_columns.append(column)
 2176
 2177        return extra_columns
 2178
 2179    def get_extra_infos_sql(self, table: str = None) -> str:
 2180        """
 2181        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2182        by double quotes
 2183
 2184        :param table: The name of the table to get the extra infos from. If None, the default table is
 2185        used
 2186        :type table: str
 2187        :return: A string of the extra infos
 2188        """
 2189
 2190        return ", ".join(
 2191            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2192        )
 2193
 2194    def export_header(
 2195        self,
 2196        header_name: str = None,
 2197        output_file: str = None,
 2198        output_file_ext: str = ".hdr",
 2199        clean_header: bool = True,
 2200        remove_chrom_line: bool = False,
 2201    ) -> str:
 2202        """
 2203        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2204        specified options, and writes it to a new file.
 2205
 2206        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2207        this parameter is not specified, the header will be written to the output file
 2208        :type header_name: str
 2209        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2210        specify the name of the output file where the header will be written. If this parameter is not
 2211        provided, the header will be written to a temporary file
 2212        :type output_file: str
 2213        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2214        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2215        if not specified by the user. This extension will be appended to the `output_file` name to
 2216        create the final, defaults to .hdr
 2217        :type output_file_ext: str (optional)
 2218        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2219        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2220        `True`, the function will clean the header by modifying certain lines based on a specific
 2221        pattern. If `clean_header`, defaults to True
 2222        :type clean_header: bool (optional)
 2223        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2224        boolean flag that determines whether the #CHROM line should be removed from the header before
 2225        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2226        defaults to False
 2227        :type remove_chrom_line: bool (optional)
 2228        :return: The function `export_header` returns the name of the temporary header file that is
 2229        created.
 2230        """
 2231
 2232        if not header_name and not output_file:
 2233            output_file = self.get_output()
 2234
 2235        if self.get_header():
 2236
 2237            # Get header object
 2238            header_obj = self.get_header()
 2239
 2240            # Create database
 2241            db_for_header = Database(database=self.get_input())
 2242
 2243            # Get real columns in the file
 2244            db_header_columns = db_for_header.get_columns()
 2245
 2246            with tempfile.TemporaryDirectory() as tmpdir:
 2247
 2248                # Write header file
 2249                header_file_tmp = os.path.join(tmpdir, "header")
 2250                f = open(header_file_tmp, "w")
 2251                vcf.Writer(f, header_obj)
 2252                f.close()
 2253
 2254                # Replace #CHROM line with rel columns
 2255                header_list = db_for_header.read_header_file(
 2256                    header_file=header_file_tmp
 2257                )
 2258                header_list[-1] = "\t".join(db_header_columns)
 2259
 2260                # Remove CHROM line
 2261                if remove_chrom_line:
 2262                    header_list.pop()
 2263
 2264                # Clean header
 2265                if clean_header:
 2266                    header_list_clean = []
 2267                    for head in header_list:
 2268                        # Clean head for malformed header
 2269                        head_clean = head
 2270                        head_clean = re.subn(
 2271                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2272                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2273                            head_clean,
 2274                            2,
 2275                        )[0]
 2276                        # Write header
 2277                        header_list_clean.append(head_clean)
 2278                    header_list = header_list_clean
 2279
 2280            tmp_header_name = output_file + output_file_ext
 2281
 2282            f = open(tmp_header_name, "w")
 2283            for line in header_list:
 2284                f.write(line)
 2285            f.close()
 2286
 2287        return tmp_header_name
 2288
 2289    def export_variant_vcf(
 2290        self,
 2291        vcf_file,
 2292        remove_info: bool = False,
 2293        add_samples: bool = True,
 2294        list_samples: list = [],
 2295        where_clause: str = "",
 2296        index: bool = False,
 2297        threads: int | None = None,
 2298    ) -> bool | None:
 2299        """
 2300        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2301        remove INFO field, add samples, and control compression and indexing.
 2302
 2303        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2304        written to. It is the output file that will contain the filtered VCF data based on the specified
 2305        parameters
 2306        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2307        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2308        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2309        in, defaults to False
 2310        :type remove_info: bool (optional)
 2311        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2312        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2313        If set to False, the samples will be removed. The default value is True, defaults to True
 2314        :type add_samples: bool (optional)
 2315        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2316        in the output VCF file. By default, all samples will be included. If you provide a list of
 2317        samples, only those samples will be included in the output file
 2318        :type list_samples: list
 2319        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2320        determines whether or not to create an index for the output VCF file. If `index` is set to
 2321        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2322        :type index: bool (optional)
 2323        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2324        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2325        will be used during the export process. More threads can potentially speed up the export process
 2326        by utilizing multiple cores of the processor. If
 2327        :type threads: int | None
 2328        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2329        method with various parameters including the output file, query, threads, sort flag, and index
 2330        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2331        specified parameters and configurations provided in the `export_variant_vcf` function.
 2332        """
 2333
 2334        # Config
 2335        config = self.get_config()
 2336
 2337        # Extract VCF
 2338        log.debug("Export VCF...")
 2339
 2340        # Table variants
 2341        table_variants = self.get_table_variants()
 2342
 2343        # Threads
 2344        if not threads:
 2345            threads = self.get_threads()
 2346
 2347        # Info fields
 2348        if remove_info:
 2349            if not isinstance(remove_info, str):
 2350                remove_info = "."
 2351            info_field = f"""'{remove_info}' as INFO"""
 2352        else:
 2353            info_field = "INFO"
 2354
 2355        # Samples fields
 2356        if add_samples:
 2357            if not list_samples:
 2358                list_samples = self.get_header_sample_list()
 2359            if list_samples:
 2360                samples_fields = " , FORMAT , " + " , ".join(list_samples)
 2361            else:
 2362                samples_fields = ""
 2363            log.debug(f"samples_fields: {samples_fields}")
 2364        else:
 2365            samples_fields = ""
 2366
 2367        # Where clause
 2368        if where_clause is None:
 2369            where_clause = ""
 2370
 2371        # Variants
 2372        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2373        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2374        log.debug(f"sql_query_select={sql_query_select}")
 2375
 2376        return self.export_output(
 2377            output_file=vcf_file,
 2378            output_header=None,
 2379            export_header=True,
 2380            query=sql_query_select,
 2381            parquet_partitions=None,
 2382            chunk_size=config.get("chunk_size", None),
 2383            threads=threads,
 2384            sort=True,
 2385            index=index,
 2386            order_by=None,
 2387        )
 2388
 2389    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2390        """
 2391        It takes a list of commands and runs them in parallel using the number of threads specified
 2392
 2393        :param commands: A list of commands to run
 2394        :param threads: The number of threads to use, defaults to 1 (optional)
 2395        """
 2396
 2397        run_parallel_commands(commands, threads)
 2398
 2399    def get_threads(self, default: int = 1) -> int:
 2400        """
 2401        This function returns the number of threads to use for a job, with a default value of 1 if not
 2402        specified.
 2403
 2404        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2405        default number of threads to use if no specific value is provided. If no value is provided for
 2406        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2407        used, defaults to 1
 2408        :type default: int (optional)
 2409        :return: the number of threads to use for the current job.
 2410        """
 2411
 2412        # Config
 2413        config = self.get_config()
 2414
 2415        # Param
 2416        param = self.get_param()
 2417
 2418        # Input threads
 2419        input_thread = param.get("threads", config.get("threads", None))
 2420
 2421        # Check threads
 2422        if not input_thread:
 2423            threads = default
 2424        elif int(input_thread) <= 0:
 2425            threads = os.cpu_count()
 2426        else:
 2427            threads = int(input_thread)
 2428        return threads
 2429
 2430    def get_memory(self, default: str = None) -> str:
 2431        """
 2432        This function retrieves the memory value from parameters or configuration with a default value
 2433        if not found.
 2434
 2435        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2436        default value is used as a fallback in case the `memory` parameter is not provided in the
 2437        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2438        the function
 2439        :type default: str
 2440        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2441        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2442        return the default value provided as an argument to the function.
 2443        """
 2444
 2445        # Config
 2446        config = self.get_config()
 2447
 2448        # Param
 2449        param = self.get_param()
 2450
 2451        # Input threads
 2452        input_memory = param.get("memory", config.get("memory", None))
 2453
 2454        # Check threads
 2455        if input_memory:
 2456            memory = input_memory
 2457        else:
 2458            memory = default
 2459
 2460        return memory
 2461
 2462    def update_from_vcf(self, vcf_file: str) -> None:
 2463        """
 2464        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2465
 2466        :param vcf_file: the path to the VCF file
 2467        """
 2468
 2469        connexion_format = self.get_connexion_format()
 2470
 2471        if connexion_format in ["duckdb"]:
 2472            self.update_from_vcf_duckdb(vcf_file)
 2473        elif connexion_format in ["sqlite"]:
 2474            self.update_from_vcf_sqlite(vcf_file)
 2475
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of a
        VCF file, appending the new annotations (separated by ';') to any
        existing INFO content. Variants are matched on #CHROM, POS, REF, ALT.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the '##' header lines;
        # the '#CHROM' line becomes the column header (header=0)
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # The query references `vcf_df` by name: duckdb resolves in-scope
        # pandas DataFrames when executing SQL (replacement scan).
        # Empty ('') and missing ('.') INFO values are treated as "no content"
        # on both sides, so no spurious ';' separators are produced.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2531
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of a
        VCF file, using a temporary SQLite table as staging area. New INFO
        content is appended (separated by ';') to any existing content, and
        variants are matched on #CHROM, POS, REF, ALT.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same schema as 'variants' (WHERE 0
        # copies the structure without any rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body into the temporary table; header lines (starting
        # with '#') are skipped via comment="#"
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        # NOTE(review): assumes the VCF has exactly the 8 mandatory columns
        # (no FORMAT/sample columns) — confirm against callers
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: SQLite has no concat() — string concatenation uses the ||
        # operator. Empty ('') and missing ('.') INFO values count as "no
        # content", so no spurious ';' separators are produced.
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
 2589
 2590    def drop_variants_table(self) -> None:
 2591        """
 2592        > This function drops the variants table
 2593        """
 2594
 2595        table_variants = self.get_table_variants()
 2596        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2597        self.conn.execute(sql_table_variants)
 2598
 2599    def set_variant_id(
 2600        self, variant_id_column: str = "variant_id", force: bool = None
 2601    ) -> str:
 2602        """
 2603        It adds a column to the variants table called `variant_id` and populates it with a hash of the
 2604        `#CHROM`, `POS`, `REF`, and `ALT` columns
 2605
 2606        :param variant_id_column: The name of the column to be created in the variants table, defaults
 2607        to variant_id
 2608        :type variant_id_column: str (optional)
 2609        :param force: If True, the variant_id column will be created even if it already exists
 2610        :type force: bool
 2611        :return: The name of the column that contains the variant_id
 2612        """
 2613
 2614        # Assembly
 2615        assembly = self.get_param().get(
 2616            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 2617        )
 2618
 2619        # INFO/Tag prefix
 2620        prefix = self.get_explode_infos_prefix()
 2621
 2622        # Explode INFO/SVTYPE
 2623        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
 2624
 2625        # variants table
 2626        table_variants = self.get_table_variants()
 2627
 2628        # variant_id column
 2629        if not variant_id_column:
 2630            variant_id_column = "variant_id"
 2631
 2632        # Creta variant_id column
 2633        if "variant_id" not in self.get_extra_infos() or force:
 2634
 2635            # Create column
 2636            self.add_column(
 2637                table_name=table_variants,
 2638                column_name=variant_id_column,
 2639                column_type="UBIGINT",
 2640                default_value="0",
 2641            )
 2642
 2643            # Update column
 2644            self.conn.execute(
 2645                f"""
 2646                    UPDATE {table_variants}
 2647                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
 2648                """
 2649            )
 2650
 2651        # Remove added columns
 2652        for added_column in added_columns:
 2653            self.drop_column(column=added_column)
 2654
 2655        # return variant_id column name
 2656        return variant_id_column
 2657
 2658    def get_variant_id_column(
 2659        self, variant_id_column: str = "variant_id", force: bool = None
 2660    ) -> str:
 2661        """
 2662        This function returns the variant_id column name
 2663
 2664        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2665        defaults to variant_id
 2666        :type variant_id_column: str (optional)
 2667        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2668        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2669        if it is not already set, or if it is set
 2670        :type force: bool
 2671        :return: The variant_id column name.
 2672        """
 2673
 2674        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2675
 2676    ###
 2677    # Annotation
 2678    ###
 2679
 2680    def scan_databases(
 2681        self,
 2682        database_formats: list = ["parquet"],
 2683        database_releases: list = ["current"],
 2684    ) -> dict:
 2685        """
 2686        The function `scan_databases` scans for available databases based on specified formats and
 2687        releases.
 2688
 2689        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2690        of the databases to be scanned. In this case, the accepted format is "parquet"
 2691        :type database_formats: list ["parquet"]
 2692        :param database_releases: The `database_releases` parameter is a list that specifies the
 2693        releases of the databases to be scanned. In the provided function, the default value for
 2694        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2695        databases that are in the "current"
 2696        :type database_releases: list
 2697        :return: The function `scan_databases` returns a dictionary containing information about
 2698        databases that match the specified formats and releases.
 2699        """
 2700
 2701        # Config
 2702        config = self.get_config()
 2703
 2704        # Param
 2705        param = self.get_param()
 2706
 2707        # Param - Assembly
 2708        assembly = param.get("assembly", config.get("assembly", None))
 2709        if not assembly:
 2710            assembly = DEFAULT_ASSEMBLY
 2711            log.warning(f"Default assembly '{assembly}'")
 2712
 2713        # Scan for availabled databases
 2714        log.info(
 2715            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2716        )
 2717        databases_infos_dict = databases_infos(
 2718            database_folder_releases=database_releases,
 2719            database_formats=database_formats,
 2720            assembly=assembly,
 2721            config=config,
 2722        )
 2723        log.info(
 2724            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2725        )
 2726
 2727        return databases_infos_dict
 2728
 2729    def annotation(self) -> None:
 2730        """
 2731        It annotates the VCF file with the annotations specified in the config file.
 2732        """
 2733
 2734        # Config
 2735        config = self.get_config()
 2736
 2737        # Param
 2738        param = self.get_param()
 2739
 2740        # Param - Assembly
 2741        assembly = param.get("assembly", config.get("assembly", None))
 2742        if not assembly:
 2743            assembly = DEFAULT_ASSEMBLY
 2744            log.warning(f"Default assembly '{assembly}'")
 2745
 2746        # annotations databases folders
 2747        annotations_databases = set(
 2748            config.get("folders", {})
 2749            .get("databases", {})
 2750            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
 2751            + config.get("folders", {})
 2752            .get("databases", {})
 2753            .get("parquet", ["~/howard/databases/parquet/current"])
 2754            + config.get("folders", {})
 2755            .get("databases", {})
 2756            .get("bcftools", ["~/howard/databases/bcftools/current"])
 2757        )
 2758
 2759        # Get param annotations
 2760        if param.get("annotations", None) and isinstance(
 2761            param.get("annotations", None), str
 2762        ):
 2763            log.debug(param.get("annotations", None))
 2764            param_annotation_list = param.get("annotations").split(",")
 2765        else:
 2766            param_annotation_list = []
 2767
 2768        # Each tools param
 2769        if param.get("annotation_parquet", None) != None:
 2770            log.debug(
 2771                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
 2772            )
 2773            if isinstance(param.get("annotation_parquet", None), list):
 2774                param_annotation_list.append(",".join(param.get("annotation_parquet")))
 2775            else:
 2776                param_annotation_list.append(param.get("annotation_parquet"))
 2777        if param.get("annotation_snpsift", None) != None:
 2778            if isinstance(param.get("annotation_snpsift", None), list):
 2779                param_annotation_list.append(
 2780                    "snpsift:"
 2781                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
 2782                )
 2783            else:
 2784                param_annotation_list.append(
 2785                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
 2786                )
 2787        if param.get("annotation_snpeff", None) != None:
 2788            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
 2789        if param.get("annotation_bcftools", None) != None:
 2790            if isinstance(param.get("annotation_bcftools", None), list):
 2791                param_annotation_list.append(
 2792                    "bcftools:"
 2793                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
 2794                )
 2795            else:
 2796                param_annotation_list.append(
 2797                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
 2798                )
 2799        if param.get("annotation_annovar", None) != None:
 2800            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
 2801        if param.get("annotation_exomiser", None) != None:
 2802            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
 2803        if param.get("annotation_splice", None) != None:
 2804            param_annotation_list.append("splice:" + param.get("annotation_splice"))
 2805
 2806        # Merge param annotations list
 2807        param["annotations"] = ",".join(param_annotation_list)
 2808
 2809        # debug
 2810        log.debug(f"param_annotations={param['annotations']}")
 2811
 2812        if param.get("annotations"):
 2813
 2814            # Log
 2815            # log.info("Annotations - Check annotation parameters")
 2816
 2817            if not "annotation" in param:
 2818                param["annotation"] = {}
 2819
 2820            # List of annotations parameters
 2821            annotations_list_input = {}
 2822            if isinstance(param.get("annotations", None), str):
 2823                annotation_file_list = [
 2824                    value for value in param.get("annotations", "").split(",")
 2825                ]
 2826                for annotation_file in annotation_file_list:
 2827                    annotations_list_input[annotation_file] = {"INFO": None}
 2828            else:
 2829                annotations_list_input = param.get("annotations", {})
 2830
 2831            log.info(f"Quick Annotations:")
 2832            for annotation_key in list(annotations_list_input.keys()):
 2833                log.info(f"   {annotation_key}")
 2834
 2835            # List of annotations and associated fields
 2836            annotations_list = {}
 2837
 2838            for annotation_file in annotations_list_input:
 2839
 2840                # Explode annotations if ALL
 2841                if (
 2842                    annotation_file.upper() == "ALL"
 2843                    or annotation_file.upper().startswith("ALL:")
 2844                ):
 2845
 2846                    # check ALL parameters (formats, releases)
 2847                    annotation_file_split = annotation_file.split(":")
 2848                    database_formats = "parquet"
 2849                    database_releases = "current"
 2850                    for annotation_file_option in annotation_file_split[1:]:
 2851                        database_all_options_split = annotation_file_option.split("=")
 2852                        if database_all_options_split[0] == "format":
 2853                            database_formats = database_all_options_split[1].split("+")
 2854                        if database_all_options_split[0] == "release":
 2855                            database_releases = database_all_options_split[1].split("+")
 2856
 2857                    # Scan for availabled databases
 2858                    databases_infos_dict = self.scan_databases(
 2859                        database_formats=database_formats,
 2860                        database_releases=database_releases,
 2861                    )
 2862
 2863                    # Add found databases in annotation parameters
 2864                    for database_infos in databases_infos_dict.keys():
 2865                        annotations_list[database_infos] = {"INFO": None}
 2866
 2867                else:
 2868                    annotations_list[annotation_file] = annotations_list_input[
 2869                        annotation_file
 2870                    ]
 2871
 2872            # Check each databases
 2873            if len(annotations_list):
 2874
 2875                log.info(
 2876                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
 2877                )
 2878
 2879                for annotation_file in annotations_list:
 2880
 2881                    # Init
 2882                    annotations = annotations_list.get(annotation_file, None)
 2883
 2884                    # Annotation snpEff
 2885                    if annotation_file.startswith("snpeff"):
 2886
 2887                        log.debug(f"Quick Annotation snpEff")
 2888
 2889                        if "snpeff" not in param["annotation"]:
 2890                            param["annotation"]["snpeff"] = {}
 2891
 2892                        if "options" not in param["annotation"]["snpeff"]:
 2893                            param["annotation"]["snpeff"]["options"] = ""
 2894
 2895                        # snpEff options in annotations
 2896                        param["annotation"]["snpeff"]["options"] = "".join(
 2897                            annotation_file.split(":")[1:]
 2898                        )
 2899
 2900                    # Annotation Annovar
 2901                    elif annotation_file.startswith("annovar"):
 2902
 2903                        log.debug(f"Quick Annotation Annovar")
 2904
 2905                        if "annovar" not in param["annotation"]:
 2906                            param["annotation"]["annovar"] = {}
 2907
 2908                        if "annotations" not in param["annotation"]["annovar"]:
 2909                            param["annotation"]["annovar"]["annotations"] = {}
 2910
 2911                        # Options
 2912                        annotation_file_split = annotation_file.split(":")
 2913                        for annotation_file_annotation in annotation_file_split[1:]:
 2914                            if annotation_file_annotation:
 2915                                param["annotation"]["annovar"]["annotations"][
 2916                                    annotation_file_annotation
 2917                                ] = annotations
 2918
 2919                    # Annotation Exomiser
 2920                    elif annotation_file.startswith("exomiser"):
 2921
 2922                        log.debug(f"Quick Annotation Exomiser")
 2923
 2924                        param["annotation"]["exomiser"] = params_string_to_dict(
 2925                            annotation_file
 2926                        )
 2927
 2928                    # Annotation Splice
 2929                    elif annotation_file.startswith("splice"):
 2930
 2931                        log.debug(f"Quick Annotation Splice")
 2932
 2933                        param["annotation"]["splice"] = params_string_to_dict(
 2934                            annotation_file
 2935                        )
 2936
 2937                    # Annotation Parquet or BCFTOOLS
 2938                    else:
 2939
 2940                        # Tools detection
 2941                        if annotation_file.startswith("bcftools:"):
 2942                            annotation_tool_initial = "bcftools"
 2943                            annotation_file = ":".join(annotation_file.split(":")[1:])
 2944                        elif annotation_file.startswith("snpsift:"):
 2945                            annotation_tool_initial = "snpsift"
 2946                            annotation_file = ":".join(annotation_file.split(":")[1:])
 2947                        else:
 2948                            annotation_tool_initial = None
 2949
 2950                        # list of files
 2951                        annotation_file_list = annotation_file.replace("+", ":").split(
 2952                            ":"
 2953                        )
 2954
 2955                        for annotation_file in annotation_file_list:
 2956
 2957                            if annotation_file:
 2958
 2959                                # Annotation tool initial
 2960                                annotation_tool = annotation_tool_initial
 2961
 2962                                # Find file
 2963                                annotation_file_found = None
 2964
 2965                                # Expand user
 2966                                annotation_file = full_path(annotation_file)
 2967
 2968                                if os.path.exists(annotation_file):
 2969                                    annotation_file_found = annotation_file
 2970
 2971                                else:
 2972                                    # Find within assembly folders
 2973                                    for annotations_database in annotations_databases:
 2974                                        found_files = find_all(
 2975                                            annotation_file,
 2976                                            os.path.join(
 2977                                                annotations_database, assembly
 2978                                            ),
 2979                                        )
 2980                                        if len(found_files) > 0:
 2981                                            annotation_file_found = found_files[0]
 2982                                            break
 2983                                    if not annotation_file_found and not assembly:
 2984                                        # Find within folders
 2985                                        for (
 2986                                            annotations_database
 2987                                        ) in annotations_databases:
 2988                                            found_files = find_all(
 2989                                                annotation_file, annotations_database
 2990                                            )
 2991                                            if len(found_files) > 0:
 2992                                                annotation_file_found = found_files[0]
 2993                                                break
 2994                                log.debug(
 2995                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
 2996                                )
 2997
 2998                                # Full path
 2999                                annotation_file_found = full_path(annotation_file_found)
 3000
 3001                                if annotation_file_found:
 3002
 3003                                    database = Database(database=annotation_file_found)
 3004                                    quick_annotation_format = database.get_format()
 3005                                    quick_annotation_is_compressed = (
 3006                                        database.is_compressed()
 3007                                    )
 3008                                    quick_annotation_is_indexed = os.path.exists(
 3009                                        f"{annotation_file_found}.tbi"
 3010                                    )
 3011                                    bcftools_preference = False
 3012
 3013                                    # Check Annotation Tool
 3014                                    if not annotation_tool:
 3015                                        if (
 3016                                            bcftools_preference
 3017                                            and quick_annotation_format
 3018                                            in ["vcf", "bed"]
 3019                                            and quick_annotation_is_compressed
 3020                                            and quick_annotation_is_indexed
 3021                                        ):
 3022                                            annotation_tool = "bcftools"
 3023                                        elif quick_annotation_format in [
 3024                                            "vcf",
 3025                                            "bed",
 3026                                            "tsv",
 3027                                            "tsv",
 3028                                            "csv",
 3029                                            "json",
 3030                                            "tbl",
 3031                                            "parquet",
 3032                                            "duckdb",
 3033                                        ]:
 3034                                            annotation_tool = "parquet"
 3035                                        else:
 3036                                            log.error(
 3037                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3038                                            )
 3039                                            raise ValueError(
 3040                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3041                                            )
 3042
 3043                                    log.debug(
 3044                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
 3045                                    )
 3046
 3047                                    # Annotation Tool dispatch
 3048                                    if annotation_tool:
 3049                                        if annotation_tool not in param["annotation"]:
 3050                                            param["annotation"][annotation_tool] = {}
 3051                                        if (
 3052                                            "annotations"
 3053                                            not in param["annotation"][annotation_tool]
 3054                                        ):
 3055                                            param["annotation"][annotation_tool][
 3056                                                "annotations"
 3057                                            ] = {}
 3058                                        param["annotation"][annotation_tool][
 3059                                            "annotations"
 3060                                        ][annotation_file_found] = annotations
 3061
 3062                                else:
 3063                                    log.error(
 3064                                        f"Quick Annotation File {annotation_file} does NOT exist"
 3065                                    )
 3066
 3067                self.set_param(param)
 3068
 3069        if param.get("annotation", None):
 3070            log.info("Annotations")
 3071            if param.get("annotation", {}).get("parquet", None):
 3072                log.info("Annotations 'parquet'...")
 3073                self.annotation_parquet()
 3074            if param.get("annotation", {}).get("bcftools", None):
 3075                log.info("Annotations 'bcftools'...")
 3076                self.annotation_bcftools()
 3077            if param.get("annotation", {}).get("snpsift", None):
 3078                log.info("Annotations 'snpsift'...")
 3079                self.annotation_snpsift()
 3080            if param.get("annotation", {}).get("annovar", None):
 3081                log.info("Annotations 'annovar'...")
 3082                self.annotation_annovar()
 3083            if param.get("annotation", {}).get("snpeff", None):
 3084                log.info("Annotations 'snpeff'...")
 3085                self.annotation_snpeff()
 3086            if param.get("annotation", {}).get("exomiser", None) is not None:
 3087                log.info("Annotations 'exomiser'...")
 3088                self.annotation_exomiser()
 3089            if param.get("annotation", {}).get("splice", None) is not None:
 3090                log.info("Annotations 'splice' ...")
 3091                self.annotation_splice()
 3092
 3093        # Explode INFOS fields into table fields
 3094        if self.get_explode_infos():
 3095            self.explode_infos(
 3096                prefix=self.get_explode_infos_prefix(),
 3097                fields=self.get_explode_infos_fields(),
 3098                force=True,
 3099            )
 3100
 3101    def annotation_snpsift(self, threads: int = None) -> None:
 3102        """
 3103        This function annotate with bcftools
 3104
 3105        :param threads: Number of threads to use
 3106        :return: the value of the variable "return_value".
 3107        """
 3108
 3109        # DEBUG
 3110        log.debug("Start annotation with bcftools databases")
 3111
 3112        # Threads
 3113        if not threads:
 3114            threads = self.get_threads()
 3115        log.debug("Threads: " + str(threads))
 3116
 3117        # Config
 3118        config = self.get_config()
 3119        log.debug("Config: " + str(config))
 3120
 3121        # Config - snpSift
 3122        snpsift_bin_command = get_bin_command(
 3123            bin="SnpSift.jar",
 3124            tool="snpsift",
 3125            bin_type="jar",
 3126            config=config,
 3127            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3128        )
 3129        if not snpsift_bin_command:
 3130            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3131            log.error(msg_err)
 3132            raise ValueError(msg_err)
 3133
 3134        # Config - bcftools
 3135        bcftools_bin_command = get_bin_command(
 3136            bin="bcftools",
 3137            tool="bcftools",
 3138            bin_type="bin",
 3139            config=config,
 3140            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3141        )
 3142        if not bcftools_bin_command:
 3143            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3144            log.error(msg_err)
 3145            raise ValueError(msg_err)
 3146
 3147        # Config - BCFTools databases folders
 3148        databases_folders = set(
 3149            self.get_config()
 3150            .get("folders", {})
 3151            .get("databases", {})
 3152            .get("annotations", ["."])
 3153            + self.get_config()
 3154            .get("folders", {})
 3155            .get("databases", {})
 3156            .get("bcftools", ["."])
 3157        )
 3158        log.debug("Databases annotations: " + str(databases_folders))
 3159
 3160        # Param
 3161        annotations = (
 3162            self.get_param()
 3163            .get("annotation", {})
 3164            .get("snpsift", {})
 3165            .get("annotations", None)
 3166        )
 3167        log.debug("Annotations: " + str(annotations))
 3168
 3169        # Assembly
 3170        assembly = self.get_param().get(
 3171            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3172        )
 3173
 3174        # Data
 3175        table_variants = self.get_table_variants()
 3176
 3177        # Check if not empty
 3178        log.debug("Check if not empty")
 3179        sql_query_chromosomes = (
 3180            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3181        )
 3182        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3183        if not sql_query_chromosomes_df["count"][0]:
 3184            log.info(f"VCF empty")
 3185            return
 3186
 3187        # VCF header
 3188        vcf_reader = self.get_header()
 3189        log.debug("Initial header: " + str(vcf_reader.infos))
 3190
 3191        # Existing annotations
 3192        for vcf_annotation in self.get_header().infos:
 3193
 3194            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3195            log.debug(
 3196                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3197            )
 3198
 3199        if annotations:
 3200
 3201            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3202
 3203                # Export VCF file
 3204                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3205
 3206                # Init
 3207                commands = {}
 3208
 3209                for annotation in annotations:
 3210                    annotation_fields = annotations[annotation]
 3211
 3212                    # Annotation Name
 3213                    annotation_name = os.path.basename(annotation)
 3214
 3215                    if not annotation_fields:
 3216                        annotation_fields = {"INFO": None}
 3217
 3218                    log.debug(f"Annotation '{annotation_name}'")
 3219                    log.debug(
 3220                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3221                    )
 3222
 3223                    # Create Database
 3224                    database = Database(
 3225                        database=annotation,
 3226                        databases_folders=databases_folders,
 3227                        assembly=assembly,
 3228                    )
 3229
 3230                    # Find files
 3231                    db_file = database.get_database()
 3232                    db_file = full_path(db_file)
 3233                    db_hdr_file = database.get_header_file()
 3234                    db_hdr_file = full_path(db_hdr_file)
 3235                    db_file_type = database.get_format()
 3236                    db_tbi_file = f"{db_file}.tbi"
 3237                    db_file_compressed = database.is_compressed()
 3238
 3239                    # Check if compressed
 3240                    if not db_file_compressed:
 3241                        log.error(
 3242                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3243                        )
 3244                        raise ValueError(
 3245                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3246                        )
 3247
 3248                    # Check if indexed
 3249                    if not os.path.exists(db_tbi_file):
 3250                        log.error(
 3251                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3252                        )
 3253                        raise ValueError(
 3254                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3255                        )
 3256
 3257                    # Check index - try to create if not exists
 3258                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3259                        log.error("Annotation failed: database not valid")
 3260                        log.error(f"Annotation annotation file: {db_file}")
 3261                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3262                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3263                        raise ValueError(
 3264                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3265                        )
 3266                    else:
 3267
 3268                        log.debug(
 3269                            f"Annotation '{annotation}' - file: "
 3270                            + str(db_file)
 3271                            + " and "
 3272                            + str(db_hdr_file)
 3273                        )
 3274
 3275                        # Load header as VCF object
 3276                        db_hdr_vcf = Variants(input=db_hdr_file)
 3277                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3278                        log.debug(
 3279                            "Annotation database header: "
 3280                            + str(db_hdr_vcf_header_infos)
 3281                        )
 3282
 3283                        # For all fields in database
 3284                        annotation_fields_full = False
 3285                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3286                            annotation_fields = {
 3287                                key: key for key in db_hdr_vcf_header_infos
 3288                            }
 3289                            log.debug(
 3290                                "Annotation database header - All annotations added: "
 3291                                + str(annotation_fields)
 3292                            )
 3293                            annotation_fields_full = True
 3294
 3295                        # # Create file for field rename
 3296                        # log.debug("Create file for field rename")
 3297                        # tmp_rename = NamedTemporaryFile(
 3298                        #     prefix=self.get_prefix(),
 3299                        #     dir=self.get_tmp_dir(),
 3300                        #     suffix=".rename",
 3301                        #     delete=False,
 3302                        # )
 3303                        # tmp_rename_name = tmp_rename.name
 3304                        # tmp_files.append(tmp_rename_name)
 3305
 3306                        # Number of fields
 3307                        nb_annotation_field = 0
 3308                        annotation_list = []
 3309                        annotation_infos_rename_list = []
 3310
 3311                        for annotation_field in annotation_fields:
 3312
 3313                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3314                            annotation_fields_new_name = annotation_fields.get(
 3315                                annotation_field, annotation_field
 3316                            )
 3317                            if not annotation_fields_new_name:
 3318                                annotation_fields_new_name = annotation_field
 3319
 3320                            # Check if field is in DB and if field is not elready in input data
 3321                            if (
 3322                                annotation_field in db_hdr_vcf.get_header().infos
 3323                                and annotation_fields_new_name
 3324                                not in self.get_header().infos
 3325                            ):
 3326
 3327                                log.info(
 3328                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3329                                )
 3330
 3331                                # BCFTools annotate param to rename fields
 3332                                if annotation_field != annotation_fields_new_name:
 3333                                    annotation_infos_rename_list.append(
 3334                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3335                                    )
 3336
 3337                                # Add INFO field to header
 3338                                db_hdr_vcf_header_infos_number = (
 3339                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3340                                )
 3341                                db_hdr_vcf_header_infos_type = (
 3342                                    db_hdr_vcf_header_infos[annotation_field].type
 3343                                    or "String"
 3344                                )
 3345                                db_hdr_vcf_header_infos_description = (
 3346                                    db_hdr_vcf_header_infos[annotation_field].desc
 3347                                    or f"{annotation_field} description"
 3348                                )
 3349                                db_hdr_vcf_header_infos_source = (
 3350                                    db_hdr_vcf_header_infos[annotation_field].source
 3351                                    or "unknown"
 3352                                )
 3353                                db_hdr_vcf_header_infos_version = (
 3354                                    db_hdr_vcf_header_infos[annotation_field].version
 3355                                    or "unknown"
 3356                                )
 3357
 3358                                vcf_reader.infos[annotation_fields_new_name] = (
 3359                                    vcf.parser._Info(
 3360                                        annotation_fields_new_name,
 3361                                        db_hdr_vcf_header_infos_number,
 3362                                        db_hdr_vcf_header_infos_type,
 3363                                        db_hdr_vcf_header_infos_description,
 3364                                        db_hdr_vcf_header_infos_source,
 3365                                        db_hdr_vcf_header_infos_version,
 3366                                        self.code_type_map[
 3367                                            db_hdr_vcf_header_infos_type
 3368                                        ],
 3369                                    )
 3370                                )
 3371
 3372                                annotation_list.append(annotation_field)
 3373
 3374                                nb_annotation_field += 1
 3375
 3376                            else:
 3377
 3378                                if (
 3379                                    annotation_field
 3380                                    not in db_hdr_vcf.get_header().infos
 3381                                ):
 3382                                    log.warning(
 3383                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3384                                    )
 3385                                if (
 3386                                    annotation_fields_new_name
 3387                                    in self.get_header().infos
 3388                                ):
 3389                                    log.warning(
 3390                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3391                                    )
 3392
 3393                        log.info(
 3394                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3395                        )
 3396
 3397                        annotation_infos = ",".join(annotation_list)
 3398
 3399                        if annotation_infos != "":
 3400
 3401                            # Annotated VCF (and error file)
 3402                            tmp_annotation_vcf_name = os.path.join(
 3403                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3404                            )
 3405                            tmp_annotation_vcf_name_err = (
 3406                                tmp_annotation_vcf_name + ".err"
 3407                            )
 3408
 3409                            # Add fields to annotate
 3410                            if not annotation_fields_full:
 3411                                annotation_infos_option = f"-info {annotation_infos}"
 3412                            else:
 3413                                annotation_infos_option = ""
 3414
 3415                            # Info fields rename
 3416                            if annotation_infos_rename_list:
 3417                                annotation_infos_rename = " -c " + ",".join(
 3418                                    annotation_infos_rename_list
 3419                                )
 3420                            else:
 3421                                annotation_infos_rename = ""
 3422
 3423                            # Annotate command
 3424                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3425
 3426                            # Add command
 3427                            commands[command_annotate] = tmp_annotation_vcf_name
 3428
 3429                if commands:
 3430
 3431                    # Export VCF file
 3432                    self.export_variant_vcf(
 3433                        vcf_file=tmp_vcf_name,
 3434                        remove_info=True,
 3435                        add_samples=False,
 3436                        index=True,
 3437                    )
 3438                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3439
 3440                    # Num command
 3441                    nb_command = 0
 3442
 3443                    # Annotate
 3444                    for command_annotate in commands:
 3445                        nb_command += 1
 3446                        log.info(
 3447                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3448                        )
 3449                        log.debug(f"command_annotate={command_annotate}")
 3450                        run_parallel_commands([command_annotate], threads)
 3451
 3452                        # Debug
 3453                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3454
 3455                        # Update variants
 3456                        log.info(
 3457                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3458                        )
 3459                        self.update_from_vcf(commands[command_annotate])
 3460
 3461    def annotation_bcftools(self, threads: int = None) -> None:
 3462        """
 3463        This function annotate with bcftools
 3464
 3465        :param threads: Number of threads to use
 3466        :return: the value of the variable "return_value".
 3467        """
 3468
 3469        # DEBUG
 3470        log.debug("Start annotation with bcftools databases")
 3471
 3472        # Threads
 3473        if not threads:
 3474            threads = self.get_threads()
 3475        log.debug("Threads: " + str(threads))
 3476
 3477        # Config
 3478        config = self.get_config()
 3479        log.debug("Config: " + str(config))
 3480
 3481        # DEBUG
 3482        delete_tmp = True
 3483        if self.get_config().get("verbosity", "warning") in ["debug"]:
 3484            delete_tmp = False
 3485            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 3486
 3487        # Config - BCFTools bin command
 3488        bcftools_bin_command = get_bin_command(
 3489            bin="bcftools",
 3490            tool="bcftools",
 3491            bin_type="bin",
 3492            config=config,
 3493            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3494        )
 3495        if not bcftools_bin_command:
 3496            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3497            log.error(msg_err)
 3498            raise ValueError(msg_err)
 3499
 3500        # Config - BCFTools databases folders
 3501        databases_folders = set(
 3502            self.get_config()
 3503            .get("folders", {})
 3504            .get("databases", {})
 3505            .get("annotations", ["."])
 3506            + self.get_config()
 3507            .get("folders", {})
 3508            .get("databases", {})
 3509            .get("bcftools", ["."])
 3510        )
 3511        log.debug("Databases annotations: " + str(databases_folders))
 3512
 3513        # Param
 3514        annotations = (
 3515            self.get_param()
 3516            .get("annotation", {})
 3517            .get("bcftools", {})
 3518            .get("annotations", None)
 3519        )
 3520        log.debug("Annotations: " + str(annotations))
 3521
 3522        # Assembly
 3523        assembly = self.get_param().get(
 3524            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3525        )
 3526
 3527        # Data
 3528        table_variants = self.get_table_variants()
 3529
 3530        # Check if not empty
 3531        log.debug("Check if not empty")
 3532        sql_query_chromosomes = (
 3533            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3534        )
 3535        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3536        if not sql_query_chromosomes_df["count"][0]:
 3537            log.info(f"VCF empty")
 3538            return
 3539
 3540        # Export in VCF
 3541        log.debug("Create initial file to annotate")
 3542        tmp_vcf = NamedTemporaryFile(
 3543            prefix=self.get_prefix(),
 3544            dir=self.get_tmp_dir(),
 3545            suffix=".vcf.gz",
 3546            delete=False,
 3547        )
 3548        tmp_vcf_name = tmp_vcf.name
 3549
 3550        # VCF header
 3551        vcf_reader = self.get_header()
 3552        log.debug("Initial header: " + str(vcf_reader.infos))
 3553
 3554        # Existing annotations
 3555        for vcf_annotation in self.get_header().infos:
 3556
 3557            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3558            log.debug(
 3559                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3560            )
 3561
 3562        if annotations:
 3563
 3564            tmp_ann_vcf_list = []
 3565            commands = []
 3566            tmp_files = []
 3567            err_files = []
 3568
 3569            for annotation in annotations:
 3570                annotation_fields = annotations[annotation]
 3571
 3572                # Annotation Name
 3573                annotation_name = os.path.basename(annotation)
 3574
 3575                if not annotation_fields:
 3576                    annotation_fields = {"INFO": None}
 3577
 3578                log.debug(f"Annotation '{annotation_name}'")
 3579                log.debug(
 3580                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3581                )
 3582
 3583                # Create Database
 3584                database = Database(
 3585                    database=annotation,
 3586                    databases_folders=databases_folders,
 3587                    assembly=assembly,
 3588                )
 3589
 3590                # Find files
 3591                db_file = database.get_database()
 3592                db_file = full_path(db_file)
 3593                db_hdr_file = database.get_header_file()
 3594                db_hdr_file = full_path(db_hdr_file)
 3595                db_file_type = database.get_format()
 3596                db_tbi_file = f"{db_file}.tbi"
 3597                db_file_compressed = database.is_compressed()
 3598
 3599                # Check if compressed
 3600                if not db_file_compressed:
 3601                    log.error(
 3602                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3603                    )
 3604                    raise ValueError(
 3605                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3606                    )
 3607
 3608                # Check if indexed
 3609                if not os.path.exists(db_tbi_file):
 3610                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 3611                    raise ValueError(
 3612                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3613                    )
 3614
                # Check that the database file and its header file both exist
 3616                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3617                    log.error("Annotation failed: database not valid")
 3618                    log.error(f"Annotation annotation file: {db_file}")
 3619                    log.error(f"Annotation annotation header: {db_hdr_file}")
 3620                    log.error(f"Annotation annotation index: {db_tbi_file}")
 3621                    raise ValueError(
 3622                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3623                    )
 3624                else:
 3625
 3626                    log.debug(
 3627                        f"Annotation '{annotation}' - file: "
 3628                        + str(db_file)
 3629                        + " and "
 3630                        + str(db_hdr_file)
 3631                    )
 3632
 3633                    # Load header as VCF object
 3634                    db_hdr_vcf = Variants(input=db_hdr_file)
 3635                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3636                    log.debug(
 3637                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 3638                    )
 3639
 3640                    # For all fields in database
 3641                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3642                        annotation_fields = {
 3643                            key: key for key in db_hdr_vcf_header_infos
 3644                        }
 3645                        log.debug(
 3646                            "Annotation database header - All annotations added: "
 3647                            + str(annotation_fields)
 3648                        )
 3649
 3650                    # Number of fields
 3651                    nb_annotation_field = 0
 3652                    annotation_list = []
 3653
 3654                    for annotation_field in annotation_fields:
 3655
                        # Field new name, if configured — renaming is SKIPPED, not currently managed (TODO)
 3657                        annotation_fields_new_name = annotation_fields.get(
 3658                            annotation_field, annotation_field
 3659                        )
 3660                        if not annotation_fields_new_name:
 3661                            annotation_fields_new_name = annotation_field
 3662
                        # Check if field is in DB and if field is not already in input data
 3664                        if (
 3665                            annotation_field in db_hdr_vcf.get_header().infos
 3666                            and annotation_fields_new_name
 3667                            not in self.get_header().infos
 3668                        ):
 3669
 3670                            log.info(
 3671                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3672                            )
 3673
 3674                            # Add INFO field to header
 3675                            db_hdr_vcf_header_infos_number = (
 3676                                db_hdr_vcf_header_infos[annotation_field].num or "."
 3677                            )
 3678                            db_hdr_vcf_header_infos_type = (
 3679                                db_hdr_vcf_header_infos[annotation_field].type
 3680                                or "String"
 3681                            )
 3682                            db_hdr_vcf_header_infos_description = (
 3683                                db_hdr_vcf_header_infos[annotation_field].desc
 3684                                or f"{annotation_field} description"
 3685                            )
 3686                            db_hdr_vcf_header_infos_source = (
 3687                                db_hdr_vcf_header_infos[annotation_field].source
 3688                                or "unknown"
 3689                            )
 3690                            db_hdr_vcf_header_infos_version = (
 3691                                db_hdr_vcf_header_infos[annotation_field].version
 3692                                or "unknown"
 3693                            )
 3694
 3695                            vcf_reader.infos[annotation_fields_new_name] = (
 3696                                vcf.parser._Info(
 3697                                    annotation_fields_new_name,
 3698                                    db_hdr_vcf_header_infos_number,
 3699                                    db_hdr_vcf_header_infos_type,
 3700                                    db_hdr_vcf_header_infos_description,
 3701                                    db_hdr_vcf_header_infos_source,
 3702                                    db_hdr_vcf_header_infos_version,
 3703                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 3704                                )
 3705                            )
 3706
 3707                            # annotation_list.append(annotation_field)
 3708                            if annotation_field != annotation_fields_new_name:
 3709                                annotation_list.append(
 3710                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3711                                )
 3712                            else:
 3713                                annotation_list.append(annotation_field)
 3714
 3715                            nb_annotation_field += 1
 3716
 3717                        else:
 3718
 3719                            if annotation_field not in db_hdr_vcf.get_header().infos:
 3720                                log.warning(
 3721                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 3722                                )
 3723                            if annotation_fields_new_name in self.get_header().infos:
 3724                                log.warning(
 3725                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3726                                )
 3727
 3728                    log.info(
 3729                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3730                    )
 3731
 3732                    annotation_infos = ",".join(annotation_list)
 3733
 3734                    if annotation_infos != "":
 3735
 3736                        # Protect header for bcftools (remove "#CHROM" and variants line)
 3737                        log.debug("Protect Header file - remove #CHROM line if exists")
 3738                        tmp_header_vcf = NamedTemporaryFile(
 3739                            prefix=self.get_prefix(),
 3740                            dir=self.get_tmp_dir(),
 3741                            suffix=".hdr",
 3742                            delete=False,
 3743                        )
 3744                        tmp_header_vcf_name = tmp_header_vcf.name
 3745                        tmp_files.append(tmp_header_vcf_name)
 3746                        # Command
 3747                        if db_hdr_file.endswith(".gz"):
 3748                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 3749                        else:
 3750                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 3751                        # Run
 3752                        run_parallel_commands([command_extract_header], 1)
 3753
                        # Find chromosomes
 3755                        log.debug("Find chromosomes ")
 3756                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 3757                        sql_query_chromosomes_df = self.get_query_to_df(
 3758                            sql_query_chromosomes
 3759                        )
 3760                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 3761
 3762                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 3763
 3764                        # BED columns in the annotation file
 3765                        if db_file_type in ["bed"]:
 3766                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 3767
 3768                        for chrom in chomosomes_list:
 3769
 3770                            # Create BED on initial VCF
 3771                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 3772                            tmp_bed = NamedTemporaryFile(
 3773                                prefix=self.get_prefix(),
 3774                                dir=self.get_tmp_dir(),
 3775                                suffix=".bed",
 3776                                delete=False,
 3777                            )
 3778                            tmp_bed_name = tmp_bed.name
 3779                            tmp_files.append(tmp_bed_name)
 3780
                            # Detect regions
 3782                            log.debug(
 3783                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 3784                            )
 3785                            window = 1000000
 3786                            sql_query_intervals_for_bed = f"""
 3787                                SELECT  \"#CHROM\",
 3788                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 3789                                        \"POS\"+{window}
 3790                                FROM {table_variants} as table_variants
 3791                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 3792                            """
 3793                            regions = self.conn.execute(
 3794                                sql_query_intervals_for_bed
 3795                            ).fetchall()
 3796                            merged_regions = merge_regions(regions)
 3797                            log.debug(
 3798                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 3799                            )
 3800
 3801                            header = ["#CHROM", "START", "END"]
 3802                            with open(tmp_bed_name, "w") as f:
 3803                                # Write the header with tab delimiter
 3804                                f.write("\t".join(header) + "\n")
 3805                                for d in merged_regions:
 3806                                    # Write each data row with tab delimiter
 3807                                    f.write("\t".join(map(str, d)) + "\n")
 3808
 3809                            # Tmp files
 3810                            tmp_annotation_vcf = NamedTemporaryFile(
 3811                                prefix=self.get_prefix(),
 3812                                dir=self.get_tmp_dir(),
 3813                                suffix=".vcf.gz",
 3814                                delete=False,
 3815                            )
 3816                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 3817                            tmp_files.append(tmp_annotation_vcf_name)
 3818                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 3819                            tmp_annotation_vcf_name_err = (
 3820                                tmp_annotation_vcf_name + ".err"
 3821                            )
 3822                            err_files.append(tmp_annotation_vcf_name_err)
 3823
 3824                            # Annotate Command
 3825                            log.debug(
 3826                                f"Annotation '{annotation}' - add bcftools command"
 3827                            )
 3828
 3829                            # Command
 3830                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3831
 3832                            # Add command
 3833                            commands.append(command_annotate)
 3834
 3835            # if some commands
 3836            if commands:
 3837
 3838                # Export VCF file
 3839                self.export_variant_vcf(
 3840                    vcf_file=tmp_vcf_name,
 3841                    remove_info=True,
 3842                    add_samples=False,
 3843                    index=True,
 3844                )
 3845
 3846                # Threads
 3847                # calculate threads for annotated commands
 3848                if commands:
 3849                    threads_bcftools_annotate = round(threads / len(commands))
 3850                else:
 3851                    threads_bcftools_annotate = 1
 3852
 3853                if not threads_bcftools_annotate:
 3854                    threads_bcftools_annotate = 1
 3855
 3856                # Add threads option to bcftools commands
 3857                if threads_bcftools_annotate > 1:
 3858                    commands_threaded = []
 3859                    for command in commands:
 3860                        commands_threaded.append(
 3861                            command.replace(
 3862                                f"{bcftools_bin_command} annotate ",
 3863                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 3864                            )
 3865                        )
 3866                    commands = commands_threaded
 3867
 3868                # Command annotation multithreading
 3869                log.debug(f"Annotation - Annotation commands: " + str(commands))
 3870                log.info(
 3871                    f"Annotation - Annotation multithreaded in "
 3872                    + str(len(commands))
 3873                    + " commands"
 3874                )
 3875
 3876                run_parallel_commands(commands, threads)
 3877
 3878                # Merge
 3879                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 3880
 3881                if tmp_ann_vcf_list_cmd:
 3882
 3883                    # Tmp file
 3884                    tmp_annotate_vcf = NamedTemporaryFile(
 3885                        prefix=self.get_prefix(),
 3886                        dir=self.get_tmp_dir(),
 3887                        suffix=".vcf.gz",
 3888                        delete=True,
 3889                    )
 3890                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 3891                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 3892                    err_files.append(tmp_annotate_vcf_name_err)
 3893
 3894                    # Tmp file remove command
 3895                    tmp_files_remove_command = ""
 3896                    if tmp_files:
 3897                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 3898
 3899                    # Command merge
 3900                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 3901                    log.info(
 3902                        f"Annotation - Annotation merging "
 3903                        + str(len(commands))
 3904                        + " annotated files"
 3905                    )
 3906                    log.debug(f"Annotation - merge command: {merge_command}")
 3907                    run_parallel_commands([merge_command], 1)
 3908
 3909                    # Error messages
 3910                    log.info(f"Error/Warning messages:")
 3911                    error_message_command_all = []
 3912                    error_message_command_warning = []
 3913                    error_message_command_err = []
 3914                    for err_file in err_files:
 3915                        with open(err_file, "r") as f:
 3916                            for line in f:
 3917                                message = line.strip()
 3918                                error_message_command_all.append(message)
 3919                                if line.startswith("[W::"):
 3920                                    error_message_command_warning.append(message)
 3921                                if line.startswith("[E::"):
 3922                                    error_message_command_err.append(
 3923                                        f"{err_file}: " + message
 3924                                    )
 3925                    # log info
 3926                    for message in list(
 3927                        set(error_message_command_err + error_message_command_warning)
 3928                    ):
 3929                        log.info(f"   {message}")
 3930                    # debug info
 3931                    for message in list(set(error_message_command_all)):
 3932                        log.debug(f"   {message}")
 3933                    # failed
 3934                    if len(error_message_command_err):
 3935                        log.error("Annotation failed: Error in commands")
 3936                        raise ValueError("Annotation failed: Error in commands")
 3937
 3938                    # Update variants
 3939                    log.info(f"Annotation - Updating...")
 3940                    self.update_from_vcf(tmp_annotate_vcf_name)
 3941
 3942    def annotation_exomiser(self, threads: int = None) -> None:
 3943        """
 3944        This function annotate with Exomiser
 3945
 3946        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 3947        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
 3951            Default : None
 3952        - "preset" (string):
 3953            Analysis preset (available in config folder).
 3954            Used if no full "analysis" is provided.
 3955            Default: "exome"
 3956        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
 3958            Either a dict, or a file in JSON or YAML format.
 3959            Default: None
 3960        - "subject" (dict):
 3961            Sample parameters (see Exomiser docs).
 3962            Example:
 3963                "subject":
 3964                    {
 3965                        "id": "ISDBM322017",
 3966                        "sex": "FEMALE"
 3967                    }
 3968            Default: None
 3969        - "sample" (string):
 3970            Sample name to construct "subject" section:
 3971                "subject":
 3972                    {
 3973                        "id": "<sample>",
 3974                        "sex": "UNKNOWN_SEX"
 3975                    }
 3976            Default: None
 3977        - "phenotypicFeatures" (dict)
 3978            Phenotypic features to construct "subject" section.
 3979            Example:
 3980                "phenotypicFeatures":
 3981                    [
 3982                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 3983                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 3984                    ]
 3985        - "hpo" (list)
 3986            List of HPO ids as phenotypic features.
 3987            Example:
 3988                "hpo": ['0001156', '0001363', '0011304', '0010055']
 3989            Default: []
 3990        - "outputOptions" (dict):
 3991            Output options (see Exomiser docs).
 3992            Default:
 3993                "output_options" =
 3994                    {
 3995                        "outputContributingVariantsOnly": False,
 3996                        "numGenes": 0,
 3997                        "outputFormats": ["TSV_VARIANT", "VCF"]
 3998                    }
 3999        - "transcript_source" (string):
 4000            Transcript source (either "refseq", "ucsc", "ensembl")
 4001            Default: "refseq"
 4002        - "exomiser_to_info" (boolean):
 4003            Add exomiser TSV file columns as INFO fields in VCF.
 4004            Default: False
 4005        - "release" (string):
            Exomiser database release.
 4007            If not exists, database release will be downloaded (take a while).
 4008            Default: None (provided by application.properties configuration file)
 4009        - "exomiser_application_properties" (file):
 4010            Exomiser configuration file (see Exomiser docs).
 4011            Useful to automatically download databases (especially for specific genome databases).
 4012
 4013        Notes:
 4014        - If no sample in parameters, first sample in VCF will be chosen
 4015        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
 4016
 4017        :param threads: The number of threads to use
 4018        :return: None.
 4019        """
 4020
 4021        # DEBUG
 4022        log.debug("Start annotation with Exomiser databases")
 4023
 4024        # Threads
 4025        if not threads:
 4026            threads = self.get_threads()
 4027        log.debug("Threads: " + str(threads))
 4028
 4029        # Config
 4030        config = self.get_config()
 4031        log.debug("Config: " + str(config))
 4032
 4033        # Config - Folders - Databases
 4034        databases_folders = (
 4035            config.get("folders", {})
 4036            .get("databases", {})
 4037            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4038        )
 4039        databases_folders = full_path(databases_folders)
 4040        if not os.path.exists(databases_folders):
 4041            log.error(f"Databases annotations: {databases_folders} NOT found")
 4042        log.debug("Databases annotations: " + str(databases_folders))
 4043
 4044        # Config - Exomiser
 4045        exomiser_bin_command = get_bin_command(
 4046            bin="exomiser-cli*.jar",
 4047            tool="exomiser",
 4048            bin_type="jar",
 4049            config=config,
 4050            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4051        )
 4052        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4053        if not exomiser_bin_command:
 4054            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4055            log.error(msg_err)
 4056            raise ValueError(msg_err)
 4057
 4058        # Param
 4059        param = self.get_param()
 4060        log.debug("Param: " + str(param))
 4061
 4062        # Param - Exomiser
 4063        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4064        log.debug(f"Param Exomiser: {param_exomiser}")
 4065
 4066        # Param - Assembly
 4067        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4068        log.debug("Assembly: " + str(assembly))
 4069
 4070        # Data
 4071        table_variants = self.get_table_variants()
 4072
 4073        # Check if not empty
 4074        log.debug("Check if not empty")
 4075        sql_query_chromosomes = (
 4076            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4077        )
 4078        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4079            log.info(f"VCF empty")
 4080            return False
 4081
 4082        # VCF header
 4083        vcf_reader = self.get_header()
 4084        log.debug("Initial header: " + str(vcf_reader.infos))
 4085
 4086        # Samples
 4087        samples = self.get_header_sample_list()
 4088        if not samples:
 4089            log.error("No Samples in VCF")
 4090            return False
 4091        log.debug(f"Samples: {samples}")
 4092
 4093        # Memory limit
 4094        memory_limit = self.get_memory("8G")
 4095        log.debug(f"memory_limit: {memory_limit}")
 4096
 4097        # Exomiser java options
 4098        exomiser_java_options = (
 4099            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4100        )
 4101        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4102
 4103        # Download Exomiser (if not exists)
 4104        exomiser_release = param_exomiser.get("release", None)
 4105        exomiser_application_properties = param_exomiser.get(
 4106            "exomiser_application_properties", None
 4107        )
 4108        databases_download_exomiser(
 4109            assemblies=[assembly],
 4110            exomiser_folder=databases_folders,
 4111            exomiser_release=exomiser_release,
 4112            exomiser_phenotype_release=exomiser_release,
 4113            exomiser_application_properties=exomiser_application_properties,
 4114        )
 4115
 4116        # Force annotation
 4117        force_update_annotation = True
 4118
 4119        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4120            log.debug("Start annotation Exomiser")
 4121
 4122            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4123
 4124                # tmp_dir = "/tmp/exomiser"
 4125
 4126                ### ANALYSIS ###
 4127                ################
 4128
 4129                # Create analysis.json through analysis dict
 4130                # either analysis in param or by default
 4131                # depending on preset exome/genome)
 4132
 4133                # Init analysis dict
 4134                param_exomiser_analysis_dict = {}
 4135
 4136                # analysis from param
 4137                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4138                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4139
                # If analysis in param -> load analysis json
 4141                if param_exomiser_analysis:
 4142
 4143                    # If param analysis is a file and exists
 4144                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4145                        param_exomiser_analysis
 4146                    ):
 4147                        # Load analysis file into analysis dict (either yaml or json)
 4148                        with open(param_exomiser_analysis) as json_file:
 4149                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4150
 4151                    # If param analysis is a dict
 4152                    elif isinstance(param_exomiser_analysis, dict):
 4153                        # Load analysis dict into analysis dict (either yaml or json)
 4154                        param_exomiser_analysis_dict = param_exomiser_analysis
 4155
 4156                    # Error analysis type
 4157                    else:
 4158                        log.error(f"Analysis type unknown. Check param file.")
 4159                        raise ValueError(f"Analysis type unknown. Check param file.")
 4160
 4161                # Case no input analysis config file/dict
 4162                # Use preset (exome/genome) to open default config file
 4163                if not param_exomiser_analysis_dict:
 4164
 4165                    # default preset
 4166                    default_preset = "exome"
 4167
 4168                    # Get param preset or default preset
 4169                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4170
 4171                    # Try to find if preset is a file
 4172                    if os.path.exists(param_exomiser_preset):
 4173                        # Preset file is provided in full path
 4174                        param_exomiser_analysis_default_config_file = (
 4175                            param_exomiser_preset
 4176                        )
 4177                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4178                    #     # Preset file is provided in full path
 4179                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4180                    elif os.path.exists(
 4181                        os.path.join(folder_config, param_exomiser_preset)
 4182                    ):
                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
 4184                        param_exomiser_analysis_default_config_file = os.path.join(
 4185                            folder_config, param_exomiser_preset
 4186                        )
 4187                    else:
 4188                        # Construct preset file
 4189                        param_exomiser_analysis_default_config_file = os.path.join(
 4190                            folder_config,
 4191                            f"preset-{param_exomiser_preset}-analysis.json",
 4192                        )
 4193
 4194                    # If preset file exists
 4195                    param_exomiser_analysis_default_config_file = full_path(
 4196                        param_exomiser_analysis_default_config_file
 4197                    )
 4198                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4199                        # Load prest file into analysis dict (either yaml or json)
 4200                        with open(
 4201                            param_exomiser_analysis_default_config_file
 4202                        ) as json_file:
 4203                            # param_exomiser_analysis_dict[""] = json.load(json_file)
 4204                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4205                                json_file
 4206                            )
 4207
 4208                    # Error preset file
 4209                    else:
 4210                        log.error(
 4211                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4212                        )
 4213                        raise ValueError(
 4214                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4215                        )
 4216
 4217                # If no analysis dict created
 4218                if not param_exomiser_analysis_dict:
 4219                    log.error(f"No analysis config")
 4220                    raise ValueError(f"No analysis config")
 4221
 4222                # Log
 4223                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4224
 4225                ### PHENOPACKET ###
 4226                ###################
 4227
 4228                # If no PhenoPacket in analysis dict -> check in param
 4229                if "phenopacket" not in param_exomiser_analysis_dict:
 4230
 4231                    # If PhenoPacket in param -> load anlaysis json
 4232                    if param_exomiser.get("phenopacket", None):
 4233
 4234                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4235                        param_exomiser_phenopacket = full_path(
 4236                            param_exomiser_phenopacket
 4237                        )
 4238
 4239                        # If param phenopacket is a file and exists
 4240                        if isinstance(
 4241                            param_exomiser_phenopacket, str
 4242                        ) and os.path.exists(param_exomiser_phenopacket):
 4243                            # Load phenopacket file into analysis dict (either yaml or json)
 4244                            with open(param_exomiser_phenopacket) as json_file:
 4245                                param_exomiser_analysis_dict["phenopacket"] = (
 4246                                    yaml.safe_load(json_file)
 4247                                )
 4248
 4249                        # If param phenopacket is a dict
 4250                        elif isinstance(param_exomiser_phenopacket, dict):
 4251                            # Load phenopacket dict into analysis dict (either yaml or json)
 4252                            param_exomiser_analysis_dict["phenopacket"] = (
 4253                                param_exomiser_phenopacket
 4254                            )
 4255
 4256                        # Error phenopacket type
 4257                        else:
 4258                            log.error(f"Phenopacket type unknown. Check param file.")
 4259                            raise ValueError(
 4260                                f"Phenopacket type unknown. Check param file."
 4261                            )
 4262
 4263                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4264                if "phenopacket" not in param_exomiser_analysis_dict:
 4265
 4266                    # Init PhenoPacket
 4267                    param_exomiser_analysis_dict["phenopacket"] = {
 4268                        "id": "analysis",
 4269                        "proband": {},
 4270                    }
 4271
 4272                    ### Add subject ###
 4273
 4274                    # If subject exists
 4275                    param_exomiser_subject = param_exomiser.get("subject", {})
 4276
 4277                    # If subject not exists -> found sample ID
 4278                    if not param_exomiser_subject:
 4279
 4280                        # Found sample ID in param
 4281                        sample = param_exomiser.get("sample", None)
 4282
 4283                        # Find sample ID (first sample)
 4284                        if not sample:
 4285                            sample_list = self.get_header_sample_list()
 4286                            if len(sample_list) > 0:
 4287                                sample = sample_list[0]
 4288                            else:
 4289                                log.error(f"No sample found")
 4290                                raise ValueError(f"No sample found")
 4291
 4292                        # Create subject
 4293                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4294
 4295                    # Add to dict
 4296                    param_exomiser_analysis_dict["phenopacket"][
 4297                        "subject"
 4298                    ] = param_exomiser_subject
 4299
 4300                    ### Add "phenotypicFeatures" ###
 4301
 4302                    # If phenotypicFeatures exists
 4303                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4304                        "phenotypicFeatures", []
 4305                    )
 4306
 4307                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4308                    if not param_exomiser_phenotypicfeatures:
 4309
 4310                        # Found HPO in param
 4311                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4312
 4313                        # Split HPO if list in string format separated by comma
 4314                        if isinstance(param_exomiser_hpo, str):
 4315                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4316
 4317                        # Create HPO list
 4318                        for hpo in param_exomiser_hpo:
 4319                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4320                            param_exomiser_phenotypicfeatures.append(
 4321                                {
 4322                                    "type": {
 4323                                        "id": f"HP:{hpo_clean}",
 4324                                        "label": f"HP:{hpo_clean}",
 4325                                    }
 4326                                }
 4327                            )
 4328
 4329                    # Add to dict
 4330                    param_exomiser_analysis_dict["phenopacket"][
 4331                        "phenotypicFeatures"
 4332                    ] = param_exomiser_phenotypicfeatures
 4333
 4334                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4335                    if not param_exomiser_phenotypicfeatures:
 4336                        for step in param_exomiser_analysis_dict.get(
 4337                            "analysis", {}
 4338                        ).get("steps", []):
 4339                            if "hiPhivePrioritiser" in step:
 4340                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4341                                    "steps", []
 4342                                ).remove(step)
 4343
 4344                ### Add Input File ###
 4345
 4346                # Initial file name and htsFiles
 4347                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4348                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4349                    {
 4350                        "uri": tmp_vcf_name,
 4351                        "htsFormat": "VCF",
 4352                        "genomeAssembly": assembly,
 4353                    }
 4354                ]
 4355
 4356                ### Add metaData ###
 4357
 4358                # If metaData not in analysis dict
 4359                if "metaData" not in param_exomiser_analysis_dict:
 4360                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4361                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4362                        "createdBy": "howard",
 4363                        "phenopacketSchemaVersion": 1,
 4364                    }
 4365
 4366                ### OutputOptions ###
 4367
 4368                # Init output result folder
 4369                output_results = os.path.join(tmp_dir, "results")
 4370
 4371                # If no outputOptions in analysis dict
 4372                if "outputOptions" not in param_exomiser_analysis_dict:
 4373
 4374                    # default output formats
 4375                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4376
 4377                    # Get outputOptions in param
 4378                    output_options = param_exomiser.get("outputOptions", None)
 4379
 4380                    # If no output_options in param -> check
 4381                    if not output_options:
 4382                        output_options = {
 4383                            "outputContributingVariantsOnly": False,
 4384                            "numGenes": 0,
 4385                            "outputFormats": defaut_output_formats,
 4386                        }
 4387
 4388                    # Replace outputDirectory in output options
 4389                    output_options["outputDirectory"] = output_results
 4390                    output_options["outputFileName"] = "howard"
 4391
 4392                    # Add outputOptions in analysis dict
 4393                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4394
 4395                else:
 4396
 4397                    # Replace output_results and output format (if exists in param)
 4398                    param_exomiser_analysis_dict["outputOptions"][
 4399                        "outputDirectory"
 4400                    ] = output_results
 4401                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4402                        list(
 4403                            set(
 4404                                param_exomiser_analysis_dict.get(
 4405                                    "outputOptions", {}
 4406                                ).get("outputFormats", [])
 4407                                + ["TSV_VARIANT", "VCF"]
 4408                            )
 4409                        )
 4410                    )
 4411
 4412                # log
 4413                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4414
 4415                ### ANALYSIS FILE ###
 4416                #####################
 4417
 4418                ### Full JSON analysis config file ###
 4419
 4420                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4421                with open(exomiser_analysis, "w") as fp:
 4422                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4423
 4424                ### SPLIT analysis and sample config files
 4425
 4426                # Splitted analysis dict
 4427                param_exomiser_analysis_dict_for_split = (
 4428                    param_exomiser_analysis_dict.copy()
 4429                )
 4430
 4431                # Phenopacket JSON file
 4432                exomiser_analysis_phenopacket = os.path.join(
 4433                    tmp_dir, "analysis_phenopacket.json"
 4434                )
 4435                with open(exomiser_analysis_phenopacket, "w") as fp:
 4436                    json.dump(
 4437                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4438                        fp,
 4439                        indent=4,
 4440                    )
 4441
 4442                # Analysis JSON file without Phenopacket parameters
 4443                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4444                exomiser_analysis_analysis = os.path.join(
 4445                    tmp_dir, "analysis_analysis.json"
 4446                )
 4447                with open(exomiser_analysis_analysis, "w") as fp:
 4448                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4449
 4450                ### INITAL VCF file ###
 4451                #######################
 4452
 4453                ### Create list of samples to use and include inti initial VCF file ####
 4454
 4455                # Subject (main sample)
 4456                # Get sample ID in analysis dict
 4457                sample_subject = (
 4458                    param_exomiser_analysis_dict.get("phenopacket", {})
 4459                    .get("subject", {})
 4460                    .get("id", None)
 4461                )
 4462                sample_proband = (
 4463                    param_exomiser_analysis_dict.get("phenopacket", {})
 4464                    .get("proband", {})
 4465                    .get("subject", {})
 4466                    .get("id", None)
 4467                )
 4468                sample = []
 4469                if sample_subject:
 4470                    sample.append(sample_subject)
 4471                if sample_proband:
 4472                    sample.append(sample_proband)
 4473
 4474                # Get sample ID within Pedigree
 4475                pedigree_persons_list = (
 4476                    param_exomiser_analysis_dict.get("phenopacket", {})
 4477                    .get("pedigree", {})
 4478                    .get("persons", {})
 4479                )
 4480
 4481                # Create list with all sample ID in pedigree (if exists)
 4482                pedigree_persons = []
 4483                for person in pedigree_persons_list:
 4484                    pedigree_persons.append(person.get("individualId"))
 4485
 4486                # Concat subject sample ID and samples ID in pedigreesamples
 4487                samples = list(set(sample + pedigree_persons))
 4488
 4489                # Check if sample list is not empty
 4490                if not samples:
 4491                    log.error(f"No samples found")
 4492                    raise ValueError(f"No samples found")
 4493
 4494                # Create VCF with sample (either sample in param or first one by default)
 4495                # Export VCF file
 4496                self.export_variant_vcf(
 4497                    vcf_file=tmp_vcf_name,
 4498                    remove_info=True,
 4499                    add_samples=True,
 4500                    list_samples=samples,
 4501                    index=False,
 4502                )
 4503
 4504                ### Execute Exomiser ###
 4505                ########################
 4506
 4507                # Init command
 4508                exomiser_command = ""
 4509
 4510                # Command exomiser options
 4511                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 4512
 4513                # Release
 4514                exomiser_release = param_exomiser.get("release", None)
 4515                if exomiser_release:
 4516                    # phenotype data version
 4517                    exomiser_options += (
 4518                        f" --exomiser.phenotype.data-version={exomiser_release} "
 4519                    )
 4520                    # data version
 4521                    exomiser_options += (
 4522                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 4523                    )
 4524                    # variant white list
 4525                    variant_white_list_file = (
 4526                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 4527                    )
 4528                    if os.path.exists(
 4529                        os.path.join(
 4530                            databases_folders, assembly, variant_white_list_file
 4531                        )
 4532                    ):
 4533                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 4534
 4535                # transcript_source
 4536                transcript_source = param_exomiser.get(
 4537                    "transcript_source", None
 4538                )  # ucsc, refseq, ensembl
 4539                if transcript_source:
 4540                    exomiser_options += (
 4541                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 4542                    )
 4543
 4544                # If analysis contain proband param
 4545                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 4546                    "proband", {}
 4547                ):
 4548                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 4549
 4550                # If no proband (usually uniq sample)
 4551                else:
 4552                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 4553
 4554                # Log
 4555                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 4556
 4557                # Run command
 4558                result = subprocess.call(
 4559                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 4560                )
 4561                if result:
 4562                    log.error("Exomiser command failed")
 4563                    raise ValueError("Exomiser command failed")
 4564
 4565                ### RESULTS ###
 4566                ###############
 4567
 4568                ### Annotate with TSV fields ###
 4569
 4570                # Init result tsv file
 4571                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 4572
 4573                # Init result tsv file
 4574                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 4575
 4576                # Parse TSV file and explode columns in INFO field
 4577                if exomiser_to_info and os.path.exists(output_results_tsv):
 4578
 4579                    # Log
 4580                    log.debug("Exomiser columns to VCF INFO field")
 4581
 4582                    # Retrieve columns and types
 4583                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 4584                    output_results_tsv_df = self.get_query_to_df(query)
 4585                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 4586
 4587                    # Init concat fields for update
 4588                    sql_query_update_concat_fields = []
 4589
 4590                    # Fields to avoid
 4591                    fields_to_avoid = [
 4592                        "CONTIG",
 4593                        "START",
 4594                        "END",
 4595                        "REF",
 4596                        "ALT",
 4597                        "QUAL",
 4598                        "FILTER",
 4599                        "GENOTYPE",
 4600                    ]
 4601
 4602                    # List all columns to add into header
 4603                    for header_column in output_results_tsv_columns:
 4604
 4605                        # If header column is enable
 4606                        if header_column not in fields_to_avoid:
 4607
 4608                            # Header info type
 4609                            header_info_type = "String"
 4610                            header_column_df = output_results_tsv_df[header_column]
 4611                            header_column_df_dtype = header_column_df.dtype
 4612                            if header_column_df_dtype == object:
 4613                                if (
 4614                                    pd.to_numeric(header_column_df, errors="coerce")
 4615                                    .notnull()
 4616                                    .all()
 4617                                ):
 4618                                    header_info_type = "Float"
 4619                            else:
 4620                                header_info_type = "Integer"
 4621
 4622                            # Header info
 4623                            characters_to_validate = ["-"]
 4624                            pattern = "[" + "".join(characters_to_validate) + "]"
 4625                            header_info_name = re.sub(
 4626                                pattern,
 4627                                "_",
 4628                                f"Exomiser_{header_column}".replace("#", ""),
 4629                            )
 4630                            header_info_number = "."
 4631                            header_info_description = (
 4632                                f"Exomiser {header_column} annotation"
 4633                            )
 4634                            header_info_source = "Exomiser"
 4635                            header_info_version = "unknown"
 4636                            header_info_code = CODE_TYPE_MAP[header_info_type]
 4637                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 4638                                header_info_name,
 4639                                header_info_number,
 4640                                header_info_type,
 4641                                header_info_description,
 4642                                header_info_source,
 4643                                header_info_version,
 4644                                header_info_code,
 4645                            )
 4646
 4647                            # Add field to add for update to concat fields
 4648                            sql_query_update_concat_fields.append(
 4649                                f"""
 4650                                CASE
 4651                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 4652                                    THEN concat(
 4653                                        '{header_info_name}=',
 4654                                        table_parquet."{header_column}",
 4655                                        ';'
 4656                                        )
 4657
 4658                                    ELSE ''
 4659                                END
 4660                            """
 4661                            )
 4662
 4663                    # Update query
 4664                    sql_query_update = f"""
 4665                        UPDATE {table_variants} as table_variants
 4666                            SET INFO = concat(
 4667                                            CASE
 4668                                                WHEN INFO NOT IN ('', '.')
 4669                                                THEN INFO
 4670                                                ELSE ''
 4671                                            END,
 4672                                            CASE
 4673                                                WHEN table_variants.INFO NOT IN ('','.')
 4674                                                THEN ';'
 4675                                                ELSE ''
 4676                                            END,
 4677                                            (
 4678                                            SELECT 
 4679                                                concat(
 4680                                                    {",".join(sql_query_update_concat_fields)}
 4681                                                )
 4682                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 4683                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 4684                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 4685                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 4686                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 4687                                            )
 4688                                        )
 4689                            ;
 4690                        """
 4691
 4692                    # Update
 4693                    self.conn.execute(sql_query_update)
 4694
 4695                ### Annotate with VCF INFO field ###
 4696
 4697                # Init result VCF file
 4698                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 4699
 4700                # If VCF exists
 4701                if os.path.exists(output_results_vcf):
 4702
 4703                    # Log
 4704                    log.debug("Exomiser result VCF update variants")
 4705
 4706                    # Find Exomiser INFO field annotation in header
 4707                    with gzip.open(output_results_vcf, "rt") as f:
 4708                        header_list = self.read_vcf_header(f)
 4709                    exomiser_vcf_header = vcf.Reader(
 4710                        io.StringIO("\n".join(header_list))
 4711                    )
 4712
 4713                    # Add annotation INFO field to header
 4714                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 4715
 4716                    # Update variants with VCF
 4717                    self.update_from_vcf(output_results_vcf)
 4718
 4719        return True
 4720
 4721    def annotation_snpeff(self, threads: int = None) -> None:
 4722        """
 4723        This function annotate with snpEff
 4724
 4725        :param threads: The number of threads to use
 4726        :return: the value of the variable "return_value".
 4727        """
 4728
 4729        # DEBUG
 4730        log.debug("Start annotation with snpeff databases")
 4731
 4732        # Threads
 4733        if not threads:
 4734            threads = self.get_threads()
 4735        log.debug("Threads: " + str(threads))
 4736
 4737        # DEBUG
 4738        delete_tmp = True
 4739        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4740            delete_tmp = False
 4741            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4742
 4743        # Config
 4744        config = self.get_config()
 4745        log.debug("Config: " + str(config))
 4746
 4747        # Config - Folders - Databases
 4748        databases_folders = (
 4749            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 4750        )
 4751        log.debug("Databases annotations: " + str(databases_folders))
 4752
 4753        # # Config - Java
 4754        # java_bin = get_bin(
 4755        #     tool="java",
 4756        #     bin="java",
 4757        #     bin_type="bin",
 4758        #     config=config,
 4759        #     default_folder="/usr/bin",
 4760        # )
 4761        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
 4762        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
 4763        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
 4764
 4765        # # Config - snpEff bin
 4766        # snpeff_jar = get_bin(
 4767        #     tool="snpeff",
 4768        #     bin="snpEff.jar",
 4769        #     bin_type="jar",
 4770        #     config=config,
 4771        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4772        # )
 4773        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
 4774        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4775        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4776
 4777        # Config - snpEff bin command
 4778        snpeff_bin_command = get_bin_command(
 4779            bin="snpEff.jar",
 4780            tool="snpeff",
 4781            bin_type="jar",
 4782            config=config,
 4783            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4784        )
 4785        if not snpeff_bin_command:
 4786            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 4787            log.error(msg_err)
 4788            raise ValueError(msg_err)
 4789
 4790        # Config - snpEff databases
 4791        snpeff_databases = (
 4792            config.get("folders", {})
 4793            .get("databases", {})
 4794            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 4795        )
 4796        snpeff_databases = full_path(snpeff_databases)
 4797        if snpeff_databases is not None and snpeff_databases != "":
 4798            log.debug(f"Create snpEff databases folder")
 4799            if not os.path.exists(snpeff_databases):
 4800                os.makedirs(snpeff_databases)
 4801
 4802        # Param
 4803        param = self.get_param()
 4804        log.debug("Param: " + str(param))
 4805
 4806        # Param
 4807        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 4808        log.debug("Options: " + str(options))
 4809
 4810        # Param - Assembly
 4811        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4812
 4813        # Param - Options
 4814        snpeff_options = (
 4815            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 4816        )
 4817        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 4818        snpeff_csvstats = (
 4819            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 4820        )
 4821        if snpeff_stats:
 4822            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 4823            snpeff_stats = full_path(snpeff_stats)
 4824            snpeff_options += f" -stats {snpeff_stats}"
 4825        if snpeff_csvstats:
 4826            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 4827            snpeff_csvstats = full_path(snpeff_csvstats)
 4828            snpeff_options += f" -csvStats {snpeff_csvstats}"
 4829
 4830        # Data
 4831        table_variants = self.get_table_variants()
 4832
 4833        # Check if not empty
 4834        log.debug("Check if not empty")
 4835        sql_query_chromosomes = (
 4836            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4837        )
 4838        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 4839        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4840            log.info(f"VCF empty")
 4841            return
 4842
 4843        # Export in VCF
 4844        log.debug("Create initial file to annotate")
 4845        tmp_vcf = NamedTemporaryFile(
 4846            prefix=self.get_prefix(),
 4847            dir=self.get_tmp_dir(),
 4848            suffix=".vcf.gz",
 4849            delete=True,
 4850        )
 4851        tmp_vcf_name = tmp_vcf.name
 4852
 4853        # VCF header
 4854        vcf_reader = self.get_header()
 4855        log.debug("Initial header: " + str(vcf_reader.infos))
 4856
 4857        # Existing annotations
 4858        for vcf_annotation in self.get_header().infos:
 4859
 4860            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4861            log.debug(
 4862                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4863            )
 4864
 4865        # Memory limit
 4866        # if config.get("memory", None):
 4867        #     memory_limit = config.get("memory", "8G")
 4868        # else:
 4869        #     memory_limit = "8G"
 4870        memory_limit = self.get_memory("8G")
 4871        log.debug(f"memory_limit: {memory_limit}")
 4872
 4873        # snpEff java options
 4874        snpeff_java_options = (
 4875            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4876        )
 4877        log.debug(f"Exomiser java options: {snpeff_java_options}")
 4878
 4879        force_update_annotation = True
 4880
 4881        if "ANN" not in self.get_header().infos or force_update_annotation:
 4882
 4883            # Check snpEff database
 4884            log.debug(f"Check snpEff databases {[assembly]}")
 4885            databases_download_snpeff(
 4886                folder=snpeff_databases, assemblies=[assembly], config=config
 4887            )
 4888
 4889            # Export VCF file
 4890            self.export_variant_vcf(
 4891                vcf_file=tmp_vcf_name,
 4892                remove_info=True,
 4893                add_samples=False,
 4894                index=True,
 4895            )
 4896
 4897            # Tmp file
 4898            err_files = []
 4899            tmp_annotate_vcf = NamedTemporaryFile(
 4900                prefix=self.get_prefix(),
 4901                dir=self.get_tmp_dir(),
 4902                suffix=".vcf",
 4903                delete=False,
 4904            )
 4905            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4906            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4907            err_files.append(tmp_annotate_vcf_name_err)
 4908
 4909            # Command
 4910            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 4911            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 4912            run_parallel_commands([snpeff_command], 1)
 4913
 4914            # Error messages
 4915            log.info(f"Error/Warning messages:")
 4916            error_message_command_all = []
 4917            error_message_command_warning = []
 4918            error_message_command_err = []
 4919            for err_file in err_files:
 4920                with open(err_file, "r") as f:
 4921                    for line in f:
 4922                        message = line.strip()
 4923                        error_message_command_all.append(message)
 4924                        if line.startswith("[W::"):
 4925                            error_message_command_warning.append(message)
 4926                        if line.startswith("[E::"):
 4927                            error_message_command_err.append(f"{err_file}: " + message)
 4928            # log info
 4929            for message in list(
 4930                set(error_message_command_err + error_message_command_warning)
 4931            ):
 4932                log.info(f"   {message}")
 4933            # debug info
 4934            for message in list(set(error_message_command_all)):
 4935                log.debug(f"   {message}")
 4936            # failed
 4937            if len(error_message_command_err):
 4938                log.error("Annotation failed: Error in commands")
 4939                raise ValueError("Annotation failed: Error in commands")
 4940
 4941            # Find annotation in header
 4942            with open(tmp_annotate_vcf_name, "rt") as f:
 4943                header_list = self.read_vcf_header(f)
 4944            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 4945
 4946            for ann in annovar_vcf_header.infos:
 4947                if ann not in self.get_header().infos:
 4948                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 4949
 4950            # Update variants
 4951            log.info(f"Annotation - Updating...")
 4952            self.update_from_vcf(tmp_annotate_vcf_name)
 4953
 4954        else:
 4955            if "ANN" in self.get_header().infos:
 4956                log.debug(f"Existing snpEff annotations in VCF")
 4957            if force_update_annotation:
 4958                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 4959
 4960    def annotation_annovar(self, threads: int = None) -> None:
 4961        """
 4962        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
 4963        annotations
 4964
 4965        :param threads: number of threads to use
 4966        :return: the value of the variable "return_value".
 4967        """
 4968
 4969        # DEBUG
 4970        log.debug("Start annotation with Annovar databases")
 4971
 4972        # Threads
 4973        if not threads:
 4974            threads = self.get_threads()
 4975        log.debug("Threads: " + str(threads))
 4976
 4977        # Tmp en Err files
 4978        tmp_files = []
 4979        err_files = []
 4980
 4981        # DEBUG
 4982        delete_tmp = True
 4983        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4984            delete_tmp = False
 4985            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4986
 4987        # Config
 4988        config = self.get_config()
 4989        log.debug("Config: " + str(config))
 4990
 4991        # Config - Folders - Databases
 4992        databases_folders = (
 4993            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
 4994        )
 4995        log.debug("Databases annotations: " + str(databases_folders))
 4996
 4997        # Config - annovar bin command
 4998        annovar_bin_command = get_bin_command(
 4999            bin="table_annovar.pl",
 5000            tool="annovar",
 5001            bin_type="perl",
 5002            config=config,
 5003            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
 5004        )
 5005        if not annovar_bin_command:
 5006            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
 5007            log.error(msg_err)
 5008            raise ValueError(msg_err)
 5009
 5010        # Config - BCFTools bin command
 5011        bcftools_bin_command = get_bin_command(
 5012            bin="bcftools",
 5013            tool="bcftools",
 5014            bin_type="bin",
 5015            config=config,
 5016            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 5017        )
 5018        if not bcftools_bin_command:
 5019            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 5020            log.error(msg_err)
 5021            raise ValueError(msg_err)
 5022
 5023        # Config - annovar databases
 5024        annovar_databases = (
 5025            config.get("folders", {})
 5026            .get("databases", {})
 5027            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
 5028        )
 5029        annovar_databases = full_path(annovar_databases)
 5030        if annovar_databases != "" and not os.path.exists(annovar_databases):
 5031            os.makedirs(annovar_databases)
 5032
 5033        # Param
 5034        param = self.get_param()
 5035        log.debug("Param: " + str(param))
 5036
 5037        # Param - options
 5038        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
 5039        log.debug("Options: " + str(options))
 5040
 5041        # Param - annotations
 5042        annotations = (
 5043            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
 5044        )
 5045        log.debug("Annotations: " + str(annotations))
 5046
 5047        # Param - Assembly
 5048        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5049
 5050        # Annovar database assembly
 5051        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
 5052        if annovar_databases_assembly != "" and not os.path.exists(
 5053            annovar_databases_assembly
 5054        ):
 5055            os.makedirs(annovar_databases_assembly)
 5056
 5057        # Data
 5058        table_variants = self.get_table_variants()
 5059
 5060        # Check if not empty
 5061        log.debug("Check if not empty")
 5062        sql_query_chromosomes = (
 5063            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5064        )
 5065        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 5066        if not sql_query_chromosomes_df["count"][0]:
 5067            log.info(f"VCF empty")
 5068            return
 5069
 5070        # VCF header
 5071        vcf_reader = self.get_header()
 5072        log.debug("Initial header: " + str(vcf_reader.infos))
 5073
 5074        # Existing annotations
 5075        for vcf_annotation in self.get_header().infos:
 5076
 5077            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5078            log.debug(
 5079                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5080            )
 5081
 5082        force_update_annotation = True
 5083
 5084        if annotations:
 5085
 5086            commands = []
 5087            tmp_annotates_vcf_name_list = []
 5088
 5089            # Export in VCF
 5090            log.debug("Create initial file to annotate")
 5091            tmp_vcf = NamedTemporaryFile(
 5092                prefix=self.get_prefix(),
 5093                dir=self.get_tmp_dir(),
 5094                suffix=".vcf.gz",
 5095                delete=False,
 5096            )
 5097            tmp_vcf_name = tmp_vcf.name
 5098            tmp_files.append(tmp_vcf_name)
 5099            tmp_files.append(tmp_vcf_name + ".tbi")
 5100
 5101            # Export VCF file
 5102            self.export_variant_vcf(
 5103                vcf_file=tmp_vcf_name,
 5104                remove_info=".",
 5105                add_samples=False,
 5106                index=True,
 5107            )
 5108
 5109            # Create file for field rename
 5110            log.debug("Create file for field rename")
 5111            tmp_rename = NamedTemporaryFile(
 5112                prefix=self.get_prefix(),
 5113                dir=self.get_tmp_dir(),
 5114                suffix=".rename",
 5115                delete=False,
 5116            )
 5117            tmp_rename_name = tmp_rename.name
 5118            tmp_files.append(tmp_rename_name)
 5119
 5120            # Check Annovar database
 5121            log.debug(
 5122                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
 5123            )
 5124            databases_download_annovar(
 5125                folder=annovar_databases,
 5126                files=list(annotations.keys()),
 5127                assemblies=[assembly],
 5128            )
 5129
 5130            for annotation in annotations:
 5131                annotation_fields = annotations[annotation]
 5132
 5133                if not annotation_fields:
 5134                    annotation_fields = {"INFO": None}
 5135
 5136                log.info(f"Annotations Annovar - database '{annotation}'")
 5137                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
 5138
 5139                # Tmp file for annovar
 5140                err_files = []
 5141                tmp_annotate_vcf_directory = TemporaryDirectory(
 5142                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
 5143                )
 5144                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
 5145                tmp_annotate_vcf_name_annovar = (
 5146                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
 5147                )
 5148                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
 5149                err_files.append(tmp_annotate_vcf_name_err)
 5150                tmp_files.append(tmp_annotate_vcf_name_err)
 5151
 5152                # Tmp file final vcf annotated by annovar
 5153                tmp_annotate_vcf = NamedTemporaryFile(
 5154                    prefix=self.get_prefix(),
 5155                    dir=self.get_tmp_dir(),
 5156                    suffix=".vcf.gz",
 5157                    delete=False,
 5158                )
 5159                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5160                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
 5161                tmp_files.append(tmp_annotate_vcf_name)
 5162                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
 5163
 5164                # Number of fields
 5165                annotation_list = []
 5166                annotation_renamed_list = []
 5167
 5168                for annotation_field in annotation_fields:
 5169
 5170                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 5171                    annotation_fields_new_name = annotation_fields.get(
 5172                        annotation_field, annotation_field
 5173                    )
 5174                    if not annotation_fields_new_name:
 5175                        annotation_fields_new_name = annotation_field
 5176
 5177                    if (
 5178                        force_update_annotation
 5179                        or annotation_fields_new_name not in self.get_header().infos
 5180                    ):
 5181                        annotation_list.append(annotation_field)
 5182                        annotation_renamed_list.append(annotation_fields_new_name)
 5183                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
 5184                        log.warning(
 5185                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 5186                        )
 5187
 5188                    # Add rename info
 5189                    run_parallel_commands(
 5190                        [
 5191                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
 5192                        ],
 5193                        1,
 5194                    )
 5195
 5196                # log.debug("fields_to_removed: " + str(fields_to_removed))
 5197                log.debug("annotation_list: " + str(annotation_list))
 5198
 5199                # protocol
 5200                protocol = annotation
 5201
 5202                # argument
 5203                argument = ""
 5204
 5205                # operation
 5206                operation = "f"
 5207                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
 5208                    "ensGene"
 5209                ):
 5210                    operation = "g"
 5211                    if options.get("genebase", None):
 5212                        argument = f"""'{options.get("genebase","")}'"""
 5213                elif annotation in ["cytoBand"]:
 5214                    operation = "r"
 5215
 5216                # argument option
 5217                argument_option = ""
 5218                if argument != "":
 5219                    argument_option = " --argument " + argument
 5220
 5221                # command options
 5222                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
 5223                for option in options:
 5224                    if option not in ["genebase"]:
 5225                        command_options += f""" --{option}={options[option]}"""
 5226
 5227                # Command
 5228
 5229                # Command - Annovar
 5230                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
 5231                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
 5232
 5233                # Command - start pipe
 5234                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
 5235
 5236                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
 5237                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
 5238
 5239                # Command - Special characters (refGene annotation)
 5240                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
 5241
 5242                # Command - Clean empty fields (with value ".")
 5243                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
 5244
 5245                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
 5246                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
 5247                if "ALL" not in annotation_list and "INFO" not in annotation_list:
 5248                    # for ann in annotation_renamed_list:
 5249                    for ann in annotation_list:
 5250                        annovar_fields_to_keep.append(f"^INFO/{ann}")
 5251
 5252                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
 5253
 5254                # Command - indexing
 5255                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
 5256
 5257                log.debug(f"Annotation - Annovar command: {command_annovar}")
 5258                run_parallel_commands([command_annovar], 1)
 5259
 5260                # Error messages
 5261                log.info(f"Error/Warning messages:")
 5262                error_message_command_all = []
 5263                error_message_command_warning = []
 5264                error_message_command_err = []
 5265                for err_file in err_files:
 5266                    with open(err_file, "r") as f:
 5267                        for line in f:
 5268                            message = line.strip()
 5269                            error_message_command_all.append(message)
 5270                            if line.startswith("[W::") or line.startswith("WARNING"):
 5271                                error_message_command_warning.append(message)
 5272                            if line.startswith("[E::") or line.startswith("ERROR"):
 5273                                error_message_command_err.append(
 5274                                    f"{err_file}: " + message
 5275                                )
 5276                # log info
 5277                for message in list(
 5278                    set(error_message_command_err + error_message_command_warning)
 5279                ):
 5280                    log.info(f"   {message}")
 5281                # debug info
 5282                for message in list(set(error_message_command_all)):
 5283                    log.debug(f"   {message}")
 5284                # failed
 5285                if len(error_message_command_err):
 5286                    log.error("Annotation failed: Error in commands")
 5287                    raise ValueError("Annotation failed: Error in commands")
 5288
 5289            if tmp_annotates_vcf_name_list:
 5290
 5291                # List of annotated files
 5292                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
 5293
 5294                # Tmp file
 5295                tmp_annotate_vcf = NamedTemporaryFile(
 5296                    prefix=self.get_prefix(),
 5297                    dir=self.get_tmp_dir(),
 5298                    suffix=".vcf.gz",
 5299                    delete=False,
 5300                )
 5301                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5302                tmp_files.append(tmp_annotate_vcf_name)
 5303                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5304                err_files.append(tmp_annotate_vcf_name_err)
 5305                tmp_files.append(tmp_annotate_vcf_name_err)
 5306
 5307                # Command merge
 5308                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
 5309                log.info(
 5310                    f"Annotation Annovar - Annotation merging "
 5311                    + str(len(tmp_annotates_vcf_name_list))
 5312                    + " annotated files"
 5313                )
 5314                log.debug(f"Annotation - merge command: {merge_command}")
 5315                run_parallel_commands([merge_command], 1)
 5316
 5317                # Find annotation in header
 5318                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
 5319                    header_list = self.read_vcf_header(f)
 5320                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5321
 5322                for ann in annovar_vcf_header.infos:
 5323                    if ann not in self.get_header().infos:
 5324                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5325
 5326                # Update variants
 5327                log.info(f"Annotation Annovar - Updating...")
 5328                self.update_from_vcf(tmp_annotate_vcf_name)
 5329
 5330            # Clean files
 5331            # Tmp file remove command
 5332            if True:
 5333                tmp_files_remove_command = ""
 5334                if tmp_files:
 5335                    tmp_files_remove_command = " ".join(tmp_files)
 5336                clean_command = f" rm -f {tmp_files_remove_command} "
 5337                log.debug(f"Annotation Annovar - Annotation cleaning ")
 5338                log.debug(f"Annotation - cleaning command: {clean_command}")
 5339                run_parallel_commands([clean_command], 1)
 5340
 5341    # Parquet
 5342    def annotation_parquet(self, threads: int = None) -> None:
 5343        """
 5344        It takes a VCF file, and annotates it with a parquet file
 5345
 5346        :param threads: number of threads to use for the annotation
 5347        :return: the value of the variable "result".
 5348        """
 5349
 5350        # DEBUG
 5351        log.debug("Start annotation with parquet databases")
 5352
 5353        # Threads
 5354        if not threads:
 5355            threads = self.get_threads()
 5356        log.debug("Threads: " + str(threads))
 5357
 5358        # DEBUG
 5359        delete_tmp = True
 5360        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5361            delete_tmp = False
 5362            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5363
 5364        # Config
 5365        databases_folders = set(
 5366            self.get_config()
 5367            .get("folders", {})
 5368            .get("databases", {})
 5369            .get("annotations", ["."])
 5370            + self.get_config()
 5371            .get("folders", {})
 5372            .get("databases", {})
 5373            .get("parquet", ["."])
 5374        )
 5375        log.debug("Databases annotations: " + str(databases_folders))
 5376
 5377        # Param
 5378        annotations = (
 5379            self.get_param()
 5380            .get("annotation", {})
 5381            .get("parquet", {})
 5382            .get("annotations", None)
 5383        )
 5384        log.debug("Annotations: " + str(annotations))
 5385
 5386        # Assembly
 5387        assembly = self.get_param().get(
 5388            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5389        )
 5390
 5391        # Force Update Annotation
 5392        force_update_annotation = (
 5393            self.get_param()
 5394            .get("annotation", {})
 5395            .get("options", {})
 5396            .get("annotations_update", False)
 5397        )
 5398        log.debug(f"force_update_annotation={force_update_annotation}")
 5399        force_append_annotation = (
 5400            self.get_param()
 5401            .get("annotation", {})
 5402            .get("options", {})
 5403            .get("annotations_append", False)
 5404        )
 5405        log.debug(f"force_append_annotation={force_append_annotation}")
 5406
 5407        # Data
 5408        table_variants = self.get_table_variants()
 5409
 5410        # Check if not empty
 5411        log.debug("Check if not empty")
 5412        sql_query_chromosomes_df = self.get_query_to_df(
 5413            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5414        )
 5415        if not sql_query_chromosomes_df["count"][0]:
 5416            log.info(f"VCF empty")
 5417            return
 5418
 5419        # VCF header
 5420        vcf_reader = self.get_header()
 5421        log.debug("Initial header: " + str(vcf_reader.infos))
 5422
 5423        # Nb Variants POS
 5424        log.debug("NB Variants Start")
 5425        nb_variants = self.conn.execute(
 5426            f"SELECT count(*) AS count FROM variants"
 5427        ).fetchdf()["count"][0]
 5428        log.debug("NB Variants Stop")
 5429
 5430        # Existing annotations
 5431        for vcf_annotation in self.get_header().infos:
 5432
 5433            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5434            log.debug(
 5435                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5436            )
 5437
 5438        # Added columns
 5439        added_columns = []
 5440
 5441        # drop indexes
 5442        log.debug(f"Drop indexes...")
 5443        self.drop_indexes()
 5444
 5445        if annotations:
 5446
 5447            if "ALL" in annotations:
 5448
 5449                all_param = annotations.get("ALL", {})
 5450                all_param_formats = all_param.get("formats", None)
 5451                all_param_releases = all_param.get("releases", None)
 5452
 5453                databases_infos_dict = self.scan_databases(
 5454                    database_formats=all_param_formats,
 5455                    database_releases=all_param_releases,
 5456                )
 5457                for database_infos in databases_infos_dict.keys():
 5458                    if database_infos not in annotations:
 5459                        annotations[database_infos] = {"INFO": None}
 5460
 5461            for annotation in annotations:
 5462
 5463                if annotation in ["ALL"]:
 5464                    continue
 5465
 5466                # Annotation Name
 5467                annotation_name = os.path.basename(annotation)
 5468
 5469                # Annotation fields
 5470                annotation_fields = annotations[annotation]
 5471                if not annotation_fields:
 5472                    annotation_fields = {"INFO": None}
 5473
 5474                log.debug(f"Annotation '{annotation_name}'")
 5475                log.debug(
 5476                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5477                )
 5478
 5479                # Create Database
 5480                database = Database(
 5481                    database=annotation,
 5482                    databases_folders=databases_folders,
 5483                    assembly=assembly,
 5484                )
 5485
 5486                # Find files
 5487                parquet_file = database.get_database()
 5488                parquet_hdr_file = database.get_header_file()
 5489                parquet_type = database.get_type()
 5490
 5491                # Check if files exists
 5492                if not parquet_file or not parquet_hdr_file:
 5493                    log.error("Annotation failed: file not found")
 5494                    raise ValueError("Annotation failed: file not found")
 5495                else:
 5496                    # Get parquet connexion
 5497                    parquet_sql_attach = database.get_sql_database_attach(
 5498                        output="query"
 5499                    )
 5500                    if parquet_sql_attach:
 5501                        self.conn.execute(parquet_sql_attach)
 5502                    parquet_file_link = database.get_sql_database_link()
 5503                    # Log
 5504                    log.debug(
 5505                        f"Annotation '{annotation_name}' - file: "
 5506                        + str(parquet_file)
 5507                        + " and "
 5508                        + str(parquet_hdr_file)
 5509                    )
 5510
 5511                    # Database full header columns
 5512                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 5513                        parquet_hdr_file
 5514                    )
 5515                    # Log
 5516                    log.debug(
 5517                        "Annotation database header columns : "
 5518                        + str(parquet_hdr_vcf_header_columns)
 5519                    )
 5520
 5521                    # Load header as VCF object
 5522                    parquet_hdr_vcf_header_infos = database.get_header().infos
 5523                    # Log
 5524                    log.debug(
 5525                        "Annotation database header: "
 5526                        + str(parquet_hdr_vcf_header_infos)
 5527                    )
 5528
 5529                    # Get extra infos
 5530                    parquet_columns = database.get_extra_columns()
 5531                    # Log
 5532                    log.debug("Annotation database Columns: " + str(parquet_columns))
 5533
 5534                    # Add extra columns if "ALL" in annotation_fields
 5535                    # if "ALL" in annotation_fields:
 5536                    #     allow_add_extra_column = True
 5537                    if "ALL" in annotation_fields and database.get_extra_columns():
 5538                        for extra_column in database.get_extra_columns():
 5539                            if (
 5540                                extra_column not in annotation_fields
 5541                                and extra_column.replace("INFO/", "")
 5542                                not in parquet_hdr_vcf_header_infos
 5543                            ):
 5544                                parquet_hdr_vcf_header_infos[extra_column] = (
 5545                                    vcf.parser._Info(
 5546                                        extra_column,
 5547                                        ".",
 5548                                        "String",
 5549                                        f"{extra_column} description",
 5550                                        "unknown",
 5551                                        "unknown",
 5552                                        self.code_type_map["String"],
 5553                                    )
 5554                                )
 5555
 5556                    # For all fields in database
 5557                    annotation_fields_all = False
 5558                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 5559                        annotation_fields_all = True
 5560                        annotation_fields = {
 5561                            key: key for key in parquet_hdr_vcf_header_infos
 5562                        }
 5563
 5564                        log.debug(
 5565                            "Annotation database header - All annotations added: "
 5566                            + str(annotation_fields)
 5567                        )
 5568
 5569                    # Init
 5570
 5571                    # List of annotation fields to use
 5572                    sql_query_annotation_update_info_sets = []
 5573
                    # List of annotations to aggregate
 5575                    sql_query_annotation_to_agregate = []
 5576
 5577                    # Number of fields
 5578                    nb_annotation_field = 0
 5579
 5580                    # Annotation fields processed
 5581                    annotation_fields_processed = []
 5582
 5583                    # Columns mapping
 5584                    map_columns = database.map_columns(
 5585                        columns=annotation_fields, prefixes=["INFO/"]
 5586                    )
 5587
 5588                    # Query dict for fields to remove (update option)
 5589                    query_dict_remove = {}
 5590
                    # Fetch annotation fields
 5592                    for annotation_field in annotation_fields:
 5593
 5594                        # annotation_field_column
 5595                        annotation_field_column = map_columns.get(
 5596                            annotation_field, "INFO"
 5597                        )
 5598
 5599                        # field new name, if parametered
 5600                        annotation_fields_new_name = annotation_fields.get(
 5601                            annotation_field, annotation_field
 5602                        )
 5603                        if not annotation_fields_new_name:
 5604                            annotation_fields_new_name = annotation_field
 5605
 5606                        # To annotate
 5607                        # force_update_annotation = True
 5608                        # force_append_annotation = True
 5609                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 5610                        if annotation_field in parquet_hdr_vcf_header_infos and (
 5611                            force_update_annotation
 5612                            or force_append_annotation
 5613                            or (
 5614                                annotation_fields_new_name
 5615                                not in self.get_header().infos
 5616                            )
 5617                        ):
 5618
 5619                            # Add field to annotation to process list
 5620                            annotation_fields_processed.append(
 5621                                annotation_fields_new_name
 5622                            )
 5623
 5624                            # explode infos for the field
 5625                            annotation_fields_new_name_info_msg = ""
 5626                            if (
 5627                                force_update_annotation
 5628                                and annotation_fields_new_name
 5629                                in self.get_header().infos
 5630                            ):
 5631                                # Remove field from INFO
 5632                                query = f"""
 5633                                    UPDATE {table_variants} as table_variants
 5634                                    SET INFO = REGEXP_REPLACE(
 5635                                                concat(table_variants.INFO,''),
 5636                                                ';*{annotation_fields_new_name}=[^;]*',
 5637                                                ''
 5638                                                )
 5639                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 5640                                """
 5641                                annotation_fields_new_name_info_msg = " [update]"
 5642                                query_dict_remove[
 5643                                    f"remove 'INFO/{annotation_fields_new_name}'"
 5644                                ] = query
 5645
 5646                            # Sep between fields in INFO
 5647                            nb_annotation_field += 1
 5648                            if nb_annotation_field > 1:
 5649                                annotation_field_sep = ";"
 5650                            else:
 5651                                annotation_field_sep = ""
 5652
 5653                            log.info(
 5654                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 5655                            )
 5656
 5657                            # Add INFO field to header
 5658                            parquet_hdr_vcf_header_infos_number = (
 5659                                parquet_hdr_vcf_header_infos[annotation_field].num
 5660                                or "."
 5661                            )
 5662                            parquet_hdr_vcf_header_infos_type = (
 5663                                parquet_hdr_vcf_header_infos[annotation_field].type
 5664                                or "String"
 5665                            )
 5666                            parquet_hdr_vcf_header_infos_description = (
 5667                                parquet_hdr_vcf_header_infos[annotation_field].desc
 5668                                or f"{annotation_field} description"
 5669                            )
 5670                            parquet_hdr_vcf_header_infos_source = (
 5671                                parquet_hdr_vcf_header_infos[annotation_field].source
 5672                                or "unknown"
 5673                            )
 5674                            parquet_hdr_vcf_header_infos_version = (
 5675                                parquet_hdr_vcf_header_infos[annotation_field].version
 5676                                or "unknown"
 5677                            )
 5678
 5679                            vcf_reader.infos[annotation_fields_new_name] = (
 5680                                vcf.parser._Info(
 5681                                    annotation_fields_new_name,
 5682                                    parquet_hdr_vcf_header_infos_number,
 5683                                    parquet_hdr_vcf_header_infos_type,
 5684                                    parquet_hdr_vcf_header_infos_description,
 5685                                    parquet_hdr_vcf_header_infos_source,
 5686                                    parquet_hdr_vcf_header_infos_version,
 5687                                    self.code_type_map[
 5688                                        parquet_hdr_vcf_header_infos_type
 5689                                    ],
 5690                                )
 5691                            )
 5692
 5693                            # Append
 5694                            if force_append_annotation:
 5695                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 5696                            else:
 5697                                query_case_when_append = ""
 5698
 5699                            # Annotation/Update query fields
 5700                            # Found in INFO column
 5701                            if (
 5702                                annotation_field_column == "INFO"
 5703                                and "INFO" in parquet_hdr_vcf_header_columns
 5704                            ):
 5705                                sql_query_annotation_update_info_sets.append(
 5706                                    f"""
 5707                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 5708                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 5709                                        ELSE ''
 5710                                    END
 5711                                """
 5712                                )
 5713                            # Found in a specific column
 5714                            else:
 5715                                sql_query_annotation_update_info_sets.append(
 5716                                    f"""
 5717                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
 5718                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
 5719                                        ELSE ''
 5720                                    END
 5721                                """
 5722                                )
 5723                                sql_query_annotation_to_agregate.append(
 5724                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 5725                                )
 5726
 5727                        # Not to annotate
 5728                        else:
 5729
 5730                            if force_update_annotation:
 5731                                annotation_message = "forced"
 5732                            else:
 5733                                annotation_message = "skipped"
 5734
 5735                            if annotation_field not in parquet_hdr_vcf_header_infos:
 5736                                log.warning(
 5737                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 5738                                )
 5739                            if annotation_fields_new_name in self.get_header().infos:
 5740                                log.warning(
 5741                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 5742                                )
 5743
 5744                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 5745                    # allow_annotation_full_info = True
 5746                    allow_annotation_full_info = not force_append_annotation
 5747
 5748                    if parquet_type in ["regions"]:
 5749                        allow_annotation_full_info = False
 5750
 5751                    if (
 5752                        allow_annotation_full_info
 5753                        and nb_annotation_field == len(annotation_fields)
 5754                        and annotation_fields_all
 5755                        and (
 5756                            "INFO" in parquet_hdr_vcf_header_columns
 5757                            and "INFO" in database.get_extra_columns()
 5758                        )
 5759                    ):
 5760                        log.debug("Column INFO annotation enabled")
 5761                        sql_query_annotation_update_info_sets = []
 5762                        sql_query_annotation_update_info_sets.append(
 5763                            f" table_parquet.INFO "
 5764                        )
 5765
 5766                    if sql_query_annotation_update_info_sets:
 5767
 5768                        # Annotate
 5769                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 5770
 5771                        # Join query annotation update info sets for SQL
 5772                        sql_query_annotation_update_info_sets_sql = ",".join(
 5773                            sql_query_annotation_update_info_sets
 5774                        )
 5775
 5776                        # Check chromosomes list (and variants infos)
 5777                        sql_query_chromosomes = f"""
 5778                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 5779                            FROM {table_variants} as table_variants
 5780                            GROUP BY table_variants."#CHROM"
 5781                            ORDER BY table_variants."#CHROM"
 5782                            """
 5783                        sql_query_chromosomes_df = self.conn.execute(
 5784                            sql_query_chromosomes
 5785                        ).df()
 5786                        sql_query_chromosomes_dict = {
 5787                            entry["CHROM"]: {
 5788                                "count": entry["count_variants"],
 5789                                "min": entry["min_variants"],
 5790                                "max": entry["max_variants"],
 5791                            }
 5792                            for index, entry in sql_query_chromosomes_df.iterrows()
 5793                        }
 5794
 5795                        # Init
 5796                        nb_of_query = 0
 5797                        nb_of_variant_annotated = 0
 5798                        query_dict = query_dict_remove
 5799
 5800                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 5801                        for chrom in sql_query_chromosomes_dict:
 5802
 5803                            # Number of variant by chromosome
 5804                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 5805                                chrom, {}
 5806                            ).get("count", 0)
 5807
 5808                            log.debug(
 5809                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 5810                            )
 5811
 5812                            # Annotation with regions database
 5813                            if parquet_type in ["regions"]:
 5814                                sql_query_annotation_from_clause = f"""
 5815                                    FROM (
 5816                                        SELECT 
 5817                                            '{chrom}' AS \"#CHROM\",
 5818                                            table_variants_from.\"POS\" AS \"POS\",
 5819                                            {",".join(sql_query_annotation_to_agregate)}
 5820                                        FROM {table_variants} as table_variants_from
 5821                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 5822                                            table_parquet_from."#CHROM" = '{chrom}'
 5823                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 5824                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
 5825                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 5826                                                )
 5827                                        )
 5828                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 5829                                        GROUP BY table_variants_from.\"POS\"
 5830                                        )
 5831                                        as table_parquet
 5832                                """
 5833
 5834                                sql_query_annotation_where_clause = """
 5835                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 5836                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5837                                """
 5838
 5839                            # Annotation with variants database
 5840                            else:
 5841                                sql_query_annotation_from_clause = f"""
 5842                                    FROM {parquet_file_link} as table_parquet
 5843                                """
 5844                                sql_query_annotation_where_clause = f"""
 5845                                    table_variants."#CHROM" = '{chrom}'
 5846                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 5847                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5848                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5849                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5850                                """
 5851
 5852                            # Create update query
 5853                            sql_query_annotation_chrom_interval_pos = f"""
 5854                                UPDATE {table_variants} as table_variants
 5855                                    SET INFO = 
 5856                                        concat(
 5857                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5858                                                THEN table_variants.INFO
 5859                                                ELSE ''
 5860                                            END
 5861                                            ,
 5862                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5863                                                        AND (
 5864                                                        concat({sql_query_annotation_update_info_sets_sql})
 5865                                                        )
 5866                                                        NOT IN ('','.') 
 5867                                                    THEN ';'
 5868                                                    ELSE ''
 5869                                            END
 5870                                            ,
 5871                                            {sql_query_annotation_update_info_sets_sql}
 5872                                            )
 5873                                    {sql_query_annotation_from_clause}
 5874                                    WHERE {sql_query_annotation_where_clause}
 5875                                    ;
 5876                                """
 5877
 5878                            # Add update query to dict
 5879                            query_dict[
 5880                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 5881                            ] = sql_query_annotation_chrom_interval_pos
 5882
 5883                        nb_of_query = len(query_dict)
 5884                        num_query = 0
 5885
 5886                        # SET max_expression_depth TO x
 5887                        self.conn.execute("SET max_expression_depth TO 10000")
 5888
 5889                        for query_name in query_dict:
 5890                            query = query_dict[query_name]
 5891                            num_query += 1
 5892                            log.info(
 5893                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 5894                            )
 5895                            result = self.conn.execute(query)
 5896                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 5897                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 5898                            log.info(
 5899                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 5900                            )
 5901
 5902                        log.info(
 5903                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 5904                        )
 5905
 5906                    else:
 5907
 5908                        log.info(
 5909                            f"Annotation '{annotation_name}' - No Annotations available"
 5910                        )
 5911
 5912                    log.debug("Final header: " + str(vcf_reader.infos))
 5913
 5914        # Remove added columns
 5915        for added_column in added_columns:
 5916            self.drop_column(column=added_column)
 5917
 5918    def annotation_splice(self, threads: int = None) -> None:
 5919        """
 5920        This function annotate with snpEff
 5921
 5922        :param threads: The number of threads to use
 5923        :return: the value of the variable "return_value".
 5924        """
 5925
 5926        # DEBUG
 5927        log.debug("Start annotation with splice tools")
 5928
 5929        # Threads
 5930        if not threads:
 5931            threads = self.get_threads()
 5932        log.debug("Threads: " + str(threads))
 5933
 5934        # DEBUG
 5935        delete_tmp = True
 5936        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5937            delete_tmp = False
 5938            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5939
 5940        # Config
 5941        config = self.get_config()
 5942        log.debug("Config: " + str(config))
 5943        splice_config = config.get("tools", {}).get("splice", {})
 5944        if not splice_config:
 5945            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 5946        if not splice_config:
 5947            msg_err = "No Splice tool config"
 5948            log.error(msg_err)
 5949            raise ValueError(msg_err)
 5950        log.debug(f"splice_config={splice_config}")
 5951
 5952        # Config - Folders - Databases
 5953        databases_folders = (
 5954            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 5955        )
 5956        log.debug("Databases annotations: " + str(databases_folders))
 5957
 5958        # Splice docker image
 5959        splice_docker_image = splice_config.get("docker").get("image")
 5960
 5961        # Pull splice image if it's not already there
 5962        if not check_docker_image_exists(splice_docker_image):
 5963            log.warning(
 5964                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 5965            )
 5966            try:
 5967                command(f"docker pull {splice_config.get('docker').get('image')}")
 5968            except subprocess.CalledProcessError:
 5969                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 5970                log.error(msg_err)
 5971                raise ValueError(msg_err)
 5972                return None
 5973
 5974        # Config - splice databases
 5975        splice_databases = (
 5976            config.get("folders", {})
 5977            .get("databases", {})
 5978            .get("splice", DEFAULT_SPLICE_FOLDER)
 5979        )
 5980        splice_databases = full_path(splice_databases)
 5981
 5982        # Param
 5983        param = self.get_param()
 5984        log.debug("Param: " + str(param))
 5985
 5986        # Param
 5987        options = param.get("annotation", {}).get("splice", {})
 5988        log.debug("Options: " + str(options))
 5989
 5990        # Data
 5991        table_variants = self.get_table_variants()
 5992
 5993        # Check if not empty
 5994        log.debug("Check if not empty")
 5995        sql_query_chromosomes = (
 5996            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5997        )
 5998        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 5999            log.info("VCF empty")
 6000            return None
 6001
 6002        # Export in VCF
 6003        log.debug("Create initial file to annotate")
 6004
 6005        # Create output folder
 6006        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6007        if not os.path.exists(output_folder):
 6008            Path(output_folder).mkdir(parents=True, exist_ok=True)
 6009
 6010        # Create tmp VCF file
 6011        tmp_vcf = NamedTemporaryFile(
 6012            prefix=self.get_prefix(),
 6013            dir=output_folder,
 6014            suffix=".vcf",
 6015            delete=False,
 6016        )
 6017        tmp_vcf_name = tmp_vcf.name
 6018
 6019        # VCF header
 6020        header = self.get_header()
 6021
 6022        # Existing annotations
 6023        for vcf_annotation in self.get_header().infos:
 6024
 6025            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6026            log.debug(
 6027                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6028            )
 6029
 6030        # Memory limit
 6031        if config.get("memory", None):
 6032            memory_limit = config.get("memory", "8G").upper()
 6033            # upper()
 6034        else:
 6035            memory_limit = "8G"
 6036        log.debug(f"memory_limit: {memory_limit}")
 6037
 6038        # Check number of variants to annotate
 6039        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6040        where_clause_regex_spip = r"SPiP_\w+"
 6041        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6042        df_list_of_variants_to_annotate = self.get_query_to_df(
 6043            query=f""" SELECT * FROM variants {where_clause} """
 6044        )
 6045        if len(df_list_of_variants_to_annotate) == 0:
 6046            log.warning(
 6047                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6048            )
 6049            return None
 6050        else:
 6051            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6052
 6053        # Export VCF file
 6054        self.export_variant_vcf(
 6055            vcf_file=tmp_vcf_name,
 6056            remove_info=True,
 6057            add_samples=True,
 6058            index=False,
 6059            where_clause=where_clause,
 6060        )
 6061
 6062        # Create docker container and launch splice analysis
 6063        if splice_config:
 6064
 6065            # Splice mount folders
 6066            mount_folders = splice_config.get("mount", {})
 6067
 6068            # Genome mount
 6069            mount_folders[
 6070                config.get("folders", {})
 6071                .get("databases", {})
 6072                .get("genomes", DEFAULT_GENOME_FOLDER)
 6073            ] = "ro"
 6074
 6075            # SpliceAI mount
 6076            mount_folders[
 6077                config.get("folders", {})
 6078                .get("databases", {})
 6079                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
 6080            ] = "ro"
 6081
 6082            # Genome mount
 6083            mount_folders[
 6084                config.get("folders", {})
 6085                .get("databases", {})
 6086                .get("spip", DEFAULT_SPIP_FOLDER)
 6087            ] = "ro"
 6088
 6089            # Mount folders
 6090            mount = []
 6091
 6092            # Config mount
 6093            mount = [
 6094                f"-v {full_path(path)}:{full_path(path)}:{mode}"
 6095                for path, mode in mount_folders.items()
 6096            ]
 6097
 6098            if any(value for value in splice_config.values() if value is None):
 6099                log.warning("At least one splice config parameter is empty")
 6100                return None
 6101
 6102            # Params in splice nf
 6103            def check_values(dico: dict):
 6104                """
 6105                Ensure parameters for NF splice pipeline
 6106                """
 6107                for key, val in dico.items():
 6108                    if key == "genome":
 6109                        if any(
 6110                            assemb in options.get("genome", {})
 6111                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6112                        ):
 6113                            yield f"--{key} hg19"
 6114                        elif any(
 6115                            assemb in options.get("genome", {})
 6116                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6117                        ):
 6118                            yield f"--{key} hg38"
 6119                    elif (
 6120                        (isinstance(val, str) and val)
 6121                        or isinstance(val, int)
 6122                        or isinstance(val, bool)
 6123                    ):
 6124                        yield f"--{key} {val}"
 6125
 6126            # Genome
 6127            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6128            options["genome"] = genome
 6129
 6130            # NF params
 6131            nf_params = []
 6132
 6133            # Add options
 6134            if options:
 6135                nf_params = list(check_values(options))
 6136                log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6137            else:
 6138                log.debug("No NF params provided")
 6139
 6140            # Add threads
 6141            if "threads" not in options.keys():
 6142                nf_params.append(f"--threads {threads}")
 6143
 6144            # Genome path
 6145            genome_path = find_genome(
 6146                config.get("folders", {})
 6147                .get("databases", {})
 6148                .get("genomes", DEFAULT_GENOME_FOLDER),
 6149                file=f"{genome}.fa",
 6150            )
 6151            # Add genome path
 6152            if not genome_path:
 6153                raise ValueError(
 6154                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6155                )
 6156            else:
 6157                log.debug(f"Genome: {genome_path}")
 6158                nf_params.append(f"--genome_path {genome_path}")
 6159
 6160            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6161                """
 6162                Setting up updated databases for SPiP and SpliceAI
 6163                """
 6164
 6165                try:
 6166
 6167                    # SpliceAI assembly transcriptome
 6168                    spliceai_assembly = os.path.join(
 6169                        config.get("folders", {})
 6170                        .get("databases", {})
 6171                        .get("spliceai", {}),
 6172                        options.get("genome"),
 6173                        "transcriptome",
 6174                    )
 6175                    spip_assembly = options.get("genome")
 6176
 6177                    spip = find(
 6178                        f"transcriptome_{spip_assembly}.RData",
 6179                        config.get("folders", {}).get("databases", {}).get("spip", {}),
 6180                    )
 6181                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6182                    log.debug(f"SPiP annotations: {spip}")
 6183                    log.debug(f"SpliceAI annotations: {spliceai}")
 6184                    if spip and spliceai:
 6185                        return [
 6186                            f"--spip_transcriptome {spip}",
 6187                            f"--spliceai_annotations {spliceai}",
 6188                        ]
 6189                    else:
 6190                        # TODO crash and go on with basic annotations ?
 6191                        # raise ValueError(
 6192                        #     "Can't find splice databases in configuration EXIT"
 6193                        # )
 6194                        log.warning(
 6195                            "Can't find splice databases in configuration, use annotations file from image"
 6196                        )
 6197                except TypeError:
 6198                    log.warning(
 6199                        "Can't find splice databases in configuration, use annotations file from image"
 6200                    )
 6201                    return []
 6202
 6203            # Add options, check if transcriptome option have already beend provided
 6204            if (
 6205                "spip_transcriptome" not in nf_params
 6206                and "spliceai_transcriptome" not in nf_params
 6207            ):
 6208                splice_reference = splice_annotations(options, config)
 6209                if splice_reference:
 6210                    nf_params.extend(splice_reference)
 6211
 6212            nf_params.append(f"--output_folder {output_folder}")
 6213
 6214            random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6215            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6216            log.debug(cmd)
 6217
 6218            splice_config["docker"]["command"] = cmd
 6219
 6220            docker_cmd = get_bin_command(
 6221                tool="splice",
 6222                bin_type="docker",
 6223                config=config,
 6224                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6225                add_options=f"--name {random_uuid} {' '.join(mount)}",
 6226            )
 6227
 6228            # Docker debug
 6229            # if splice_config.get("rm_container"):
 6230            #     rm_container = "--rm"
 6231            # else:
 6232            #     rm_container = ""
 6233            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6234
 6235            log.debug(docker_cmd)
 6236            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6237            log.debug(res.stdout)
 6238            if res.stderr:
 6239                log.error(res.stderr)
 6240            res.check_returncode()
 6241        else:
 6242            log.warning(f"Splice tool configuration not found: {config}")
 6243
 6244        # Update variants
 6245        log.info("Annotation - Updating...")
 6246        # Test find output vcf
 6247        log.debug(
 6248            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6249        )
 6250        output_vcf = []
 6251        # Wrong folder to look in
 6252        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6253            if (
 6254                files
 6255                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6256            ):
 6257                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6258        # log.debug(os.listdir(options.get("output_folder")))
 6259        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6260        if not output_vcf:
 6261            log.debug(
 6262                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6263            )
 6264        else:
 6265            # Get new header from annotated vcf
 6266            log.debug(f"Initial header: {len(header.infos)} fields")
 6267            # Create new header with splice infos
 6268            new_vcf = Variants(input=output_vcf[0])
 6269            new_vcf_header = new_vcf.get_header().infos
 6270            for keys, infos in new_vcf_header.items():
 6271                if keys not in header.infos.keys():
 6272                    header.infos[keys] = infos
 6273            log.debug(f"New header: {len(header.infos)} fields")
 6274            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6275            self.update_from_vcf(output_vcf[0])
 6276
 6277        # Remove folder
 6278        remove_if_exists(output_folder)
 6279
 6280    ###
 6281    # Prioritization
 6282    ###
 6283
    def get_config_default(self, name: str) -> dict:
        """
        The function `get_config_default` returns a dictionary containing default
        configurations for calculations and prioritizations.

        :param name: Name of the default configuration section to retrieve.
        Known sections are "calculations" (built-in variant calculation
        operations, either SQL expressions or Python calculation functions)
        and "prioritizations" (built-in prioritization profiles)
        :type name: str
        :return: The default configuration dictionary registered under `name`,
        or None when `name` does not match any known section
        """

        # Built-in configuration sections.
        # Each "calculations" entry is either type "sql" (an SQL expression that
        # fills a new output column/INFO field) or type "python" (the name of a
        # calculation function plus its parameters).
        config_default = {
            "calculations": {
                "variant_chr_pos_alt_ref": {
                    "type": "sql",
                    "name": "variant_chr_pos_alt_ref",
                    "description": "Create a variant ID with chromosome, position, alt and ref",
                    "available": False,
                    "output_column_name": "variant_chr_pos_alt_ref",
                    "output_column_type": "String",
                    "output_column_description": "variant ID with chromosome, position, alt and ref",
                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
                    "operation_info": True,
                },
                "VARTYPE": {
                    "type": "sql",
                    "name": "VARTYPE",
                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
                    "available": True,
                    "output_column_name": "VARTYPE",
                    "output_column_type": "String",
                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
                    "operation_query": """
                            CASE
                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
                                ELSE 'UNDEFINED'
                            END
                            """,
                    "info_fields": ["SVTYPE"],
                    "operation_info": True,
                },
                "snpeff_hgvs": {
                    "type": "python",
                    "name": "snpeff_hgvs",
                    "description": "HGVS nomenclatures from snpEff annotation",
                    "available": True,
                    "function_name": "calculation_extract_snpeff_hgvs",
                    "function_params": ["snpeff_hgvs", "ANN"],
                },
                # NOTE(review): the "description" strings of snpeff_ann_explode
                # and snpeff_ann_explode_uniquify below look swapped (the
                # uniquify variant has the plain description and vice versa) —
                # confirm before relying on them.
                "snpeff_ann_explode": {
                    "type": "python",
                    "name": "snpeff_ann_explode",
                    "description": "Explode snpEff annotations with uniquify values",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "fields", "snpeff_", "ANN"],
                },
                "snpeff_ann_explode_uniquify": {
                    "type": "python",
                    "name": "snpeff_ann_explode_uniquify",
                    "description": "Explode snpEff annotations",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
                },
                "snpeff_ann_explode_json": {
                    "type": "python",
                    "name": "snpeff_ann_explode_json",
                    "description": "Explode snpEff annotations in JSON format",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
                },
                "NOMEN": {
                    "type": "python",
                    "name": "NOMEN",
                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
                    "available": True,
                    "function_name": "calculation_extract_nomen",
                    "function_params": [],
                },
                "FINDBYPIPELINE": {
                    "type": "python",
                    "name": "FINDBYPIPELINE",
                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbypipeline"],
                },
                "FINDBYSAMPLE": {
                    "type": "python",
                    "name": "FINDBYSAMPLE",
                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbysample"],
                },
                "GENOTYPECONCORDANCE": {
                    "type": "python",
                    "name": "GENOTYPECONCORDANCE",
                    "description": "Concordance of genotype for multi caller VCF",
                    "available": True,
                    "function_name": "calculation_genotype_concordance",
                    "function_params": [],
                },
                "BARCODE": {
                    "type": "python",
                    "name": "BARCODE",
                    "description": "BARCODE as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode",
                    "function_params": [],
                },
                "BARCODEFAMILY": {
                    "type": "python",
                    "name": "BARCODEFAMILY",
                    "description": "BARCODEFAMILY as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode_family",
                    "function_params": ["BCF"],
                },
                "TRIO": {
                    "type": "python",
                    "name": "TRIO",
                    "description": "Inheritance for a trio family",
                    "available": True,
                    "function_name": "calculation_trio",
                    "function_params": [],
                },
                "VAF": {
                    "type": "python",
                    "name": "VAF",
                    "description": "Variant Allele Frequency (VAF) harmonization",
                    "available": True,
                    "function_name": "calculation_vaf_normalization",
                    "function_params": [],
                },
                "VAF_stats": {
                    "type": "python",
                    "name": "VAF_stats",
                    "description": "Variant Allele Frequency (VAF) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["VAF"],
                },
                "DP_stats": {
                    "type": "python",
                    "name": "DP_stats",
                    "description": "Depth (DP) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["DP"],
                },
                "variant_id": {
                    "type": "python",
                    "name": "variant_id",
                    "description": "Variant ID generated from variant position and type",
                    "available": True,
                    "function_name": "calculation_variant_id",
                    "function_params": [],
                },
                "transcripts_json": {
                    "type": "python",
                    "name": "transcripts_json",
                    "description": "Add transcripts info in JSON format (field 'transcripts_json')",
                    "available": True,
                    "function_name": "calculation_transcripts_json",
                    "function_params": ["transcripts_json"],
                },
                "transcripts_prioritization": {
                    "type": "python",
                    "name": "transcripts_prioritization",
                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_prioritization",
                    "function_params": [],
                },
            },
            # Default prioritization profile: each INFO field maps to a list of
            # criteria; a matching variant receives the associated score, flag
            # and comment.
            "prioritizations": {
                "default": {
                    "filter": [
                        {
                            "type": "notequals",
                            "value": "!PASS|\\.",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": ["Bad variant quality"],
                        },
                        {
                            "type": "equals",
                            "value": "REJECT",
                            "score": -20,
                            "flag": "PASS",
                            "comment": ["Bad variant quality"],
                        },
                    ],
                    "DP": [
                        {
                            "type": "gte",
                            "value": "50",
                            "score": 5,
                            "flag": "PASS",
                            "comment": ["DP higher than 50"],
                        }
                    ],
                    "ANN": [
                        {
                            "type": "contains",
                            "value": "HIGH",
                            "score": 5,
                            "flag": "PASS",
                            "comment": [
                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODERATE",
                            "score": 3,
                            "flag": "PASS",
                            "comment": [
                                "A non-disruptive variant that might change protein effectiveness"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "LOW",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Assumed to be mostly harmless or unlikely to change protein behavior"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODIFIER",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
                            ],
                        },
                    ],
                }
            },
        }

        # NOTE: returns None (not an empty dict) for an unknown section name;
        # callers must handle a missing section themselves.
        return config_default.get(name, None)
 6541
 6542    def get_config_json(
 6543        self, name: str, config_dict: dict = {}, config_file: str = None
 6544    ) -> dict:
 6545        """
 6546        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 6547        default values, a dictionary, and a file.
 6548
 6549        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 6550        the name of the configuration. It is used to identify and retrieve the configuration settings
 6551        for a specific component or module
 6552        :type name: str
 6553        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 6554        dictionary that allows you to provide additional configuration settings or overrides. When you
 6555        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 6556        the key is the configuration setting you want to override or
 6557        :type config_dict: dict
 6558        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 6559        specify the path to a configuration file that contains additional settings. If provided, the
 6560        function will read the contents of this file and update the configuration dictionary with the
 6561        values found in the file, overriding any existing values with the
 6562        :type config_file: str
 6563        :return: The function `get_config_json` returns a dictionary containing the configuration
 6564        settings.
 6565        """
 6566
 6567        # Create with default prioritizations
 6568        config_default = self.get_config_default(name=name)
 6569        configuration = config_default
 6570        # log.debug(f"configuration={configuration}")
 6571
 6572        # Replace prioritizations from dict
 6573        for config in config_dict:
 6574            configuration[config] = config_dict[config]
 6575
 6576        # Replace prioritizations from file
 6577        config_file = full_path(config_file)
 6578        if config_file:
 6579            if os.path.exists(config_file):
 6580                with open(config_file) as config_file_content:
 6581                    config_file_dict = json.load(config_file_content)
 6582                for config in config_file_dict:
 6583                    configuration[config] = config_file_dict[config]
 6584            else:
 6585                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 6586                log.error(msg_error)
 6587                raise ValueError(msg_error)
 6588
 6589        return configuration
 6590
 6591    def prioritization(
 6592        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 6593    ) -> bool:
 6594        """
 6595        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 6596        prioritizes variants based on configured profiles and criteria.
 6597
 6598        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 6599        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 6600        a table name is provided, the method will prioritize the variants in that specific table
 6601        :type table: str
 6602        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 6603        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 6604        provided, the code will use a default prefix value of "PZ"
 6605        :type pz_prefix: str
 6606        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 6607        additional parameters specific to the prioritization process. These parameters can include
 6608        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 6609        configurations needed for the prioritization of variants in a V
 6610        :type pz_param: dict
 6611        :return: A boolean value (True) is being returned from the `prioritization` function.
 6612        """
 6613
 6614        # Config
 6615        config = self.get_config()
 6616
 6617        # Param
 6618        param = self.get_param()
 6619
 6620        # Prioritization param
 6621        if pz_param is not None:
 6622            prioritization_param = pz_param
 6623        else:
 6624            prioritization_param = param.get("prioritization", {})
 6625
 6626        # Configuration profiles
 6627        prioritization_config_file = prioritization_param.get(
 6628            "prioritization_config", None
 6629        )
 6630        prioritization_config_file = full_path(prioritization_config_file)
 6631        prioritizations_config = self.get_config_json(
 6632            name="prioritizations", config_file=prioritization_config_file
 6633        )
 6634
 6635        # Prioritization prefix
 6636        pz_prefix_default = "PZ"
 6637        if pz_prefix is None:
 6638            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 6639
 6640        # Prioritization options
 6641        profiles = prioritization_param.get("profiles", [])
 6642        if isinstance(profiles, str):
 6643            profiles = profiles.split(",")
 6644        pzfields = prioritization_param.get(
 6645            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 6646        )
 6647        if isinstance(pzfields, str):
 6648            pzfields = pzfields.split(",")
 6649        default_profile = prioritization_param.get("default_profile", None)
 6650        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 6651        prioritization_score_mode = prioritization_param.get(
 6652            "prioritization_score_mode", "HOWARD"
 6653        )
 6654
 6655        # Quick Prioritizations
 6656        prioritizations = param.get("prioritizations", None)
 6657        if prioritizations:
 6658            log.info("Quick Prioritization:")
 6659            for profile in prioritizations.split(","):
 6660                if profile not in profiles:
 6661                    profiles.append(profile)
 6662                    log.info(f"   {profile}")
 6663
 6664        # If profile "ALL" provided, all profiles in the config profiles
 6665        if "ALL" in profiles:
 6666            profiles = list(prioritizations_config.keys())
 6667
 6668        for profile in profiles:
 6669            if prioritizations_config.get(profile, None):
 6670                log.debug(f"Profile '{profile}' configured")
 6671            else:
 6672                msg_error = f"Profile '{profile}' NOT configured"
 6673                log.error(msg_error)
 6674                raise ValueError(msg_error)
 6675
 6676        if profiles:
 6677            log.info(f"Prioritization... ")
 6678        else:
 6679            log.debug(f"No profile defined")
 6680            return False
 6681
 6682        if not default_profile and len(profiles):
 6683            default_profile = profiles[0]
 6684
 6685        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 6686        log.debug("Profiles to check: " + str(list(profiles)))
 6687
 6688        # Variables
 6689        if table is not None:
 6690            table_variants = table
 6691        else:
 6692            table_variants = self.get_table_variants(clause="update")
 6693        log.debug(f"Table to prioritize: {table_variants}")
 6694
 6695        # Added columns
 6696        added_columns = []
 6697
 6698        # Create list of PZfields
 6699        # List of PZFields
 6700        list_of_pzfields_original = pzfields + [
 6701            pzfield + pzfields_sep + profile
 6702            for pzfield in pzfields
 6703            for profile in profiles
 6704        ]
 6705        list_of_pzfields = []
 6706        log.debug(f"{list_of_pzfields_original}")
 6707
 6708        # Remove existing PZfields to use if exists
 6709        for pzfield in list_of_pzfields_original:
 6710            if self.get_header().infos.get(pzfield, None) is None:
 6711                list_of_pzfields.append(pzfield)
 6712                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 6713            else:
 6714                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 6715
 6716        if list_of_pzfields:
 6717
 6718            # Explode Infos prefix
 6719            explode_infos_prefix = self.get_explode_infos_prefix()
 6720
 6721            # PZfields tags description
 6722            PZfields_INFOS = {
 6723                f"{pz_prefix}Tags": {
 6724                    "ID": f"{pz_prefix}Tags",
 6725                    "Number": ".",
 6726                    "Type": "String",
 6727                    "Description": "Variant tags based on annotation criteria",
 6728                },
 6729                f"{pz_prefix}Score": {
 6730                    "ID": f"{pz_prefix}Score",
 6731                    "Number": 1,
 6732                    "Type": "Integer",
 6733                    "Description": "Variant score based on annotation criteria",
 6734                },
 6735                f"{pz_prefix}Flag": {
 6736                    "ID": f"{pz_prefix}Flag",
 6737                    "Number": 1,
 6738                    "Type": "String",
 6739                    "Description": "Variant flag based on annotation criteria",
 6740                },
 6741                f"{pz_prefix}Comment": {
 6742                    "ID": f"{pz_prefix}Comment",
 6743                    "Number": ".",
 6744                    "Type": "String",
 6745                    "Description": "Variant comment based on annotation criteria",
 6746                },
 6747                f"{pz_prefix}Infos": {
 6748                    "ID": f"{pz_prefix}Infos",
 6749                    "Number": ".",
 6750                    "Type": "String",
 6751                    "Description": "Variant infos based on annotation criteria",
 6752                },
 6753            }
 6754
 6755            # Create INFO fields if not exist
 6756            for field in PZfields_INFOS:
 6757                field_ID = PZfields_INFOS[field]["ID"]
 6758                field_description = PZfields_INFOS[field]["Description"]
 6759                if field_ID not in self.get_header().infos and field_ID in pzfields:
 6760                    field_description = (
 6761                        PZfields_INFOS[field]["Description"]
 6762                        + f", profile {default_profile}"
 6763                    )
 6764                    self.get_header().infos[field_ID] = vcf.parser._Info(
 6765                        field_ID,
 6766                        PZfields_INFOS[field]["Number"],
 6767                        PZfields_INFOS[field]["Type"],
 6768                        field_description,
 6769                        "unknown",
 6770                        "unknown",
 6771                        code_type_map[PZfields_INFOS[field]["Type"]],
 6772                    )
 6773
 6774            # Create INFO fields if not exist for each profile
 6775            for profile in prioritizations_config:
 6776                if profile in profiles or profiles == []:
 6777                    for field in PZfields_INFOS:
 6778                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 6779                        field_description = (
 6780                            PZfields_INFOS[field]["Description"]
 6781                            + f", profile {profile}"
 6782                        )
 6783                        if (
 6784                            field_ID not in self.get_header().infos
 6785                            and field in pzfields
 6786                        ):
 6787                            self.get_header().infos[field_ID] = vcf.parser._Info(
 6788                                field_ID,
 6789                                PZfields_INFOS[field]["Number"],
 6790                                PZfields_INFOS[field]["Type"],
 6791                                field_description,
 6792                                "unknown",
 6793                                "unknown",
 6794                                code_type_map[PZfields_INFOS[field]["Type"]],
 6795                            )
 6796
 6797            # Header
 6798            for pzfield in list_of_pzfields:
 6799                if re.match(f"{pz_prefix}Score.*", pzfield):
 6800                    added_column = self.add_column(
 6801                        table_name=table_variants,
 6802                        column_name=pzfield,
 6803                        column_type="INTEGER",
 6804                        default_value="0",
 6805                    )
 6806                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 6807                    added_column = self.add_column(
 6808                        table_name=table_variants,
 6809                        column_name=pzfield,
 6810                        column_type="BOOLEAN",
 6811                        default_value="1",
 6812                    )
 6813                else:
 6814                    added_column = self.add_column(
 6815                        table_name=table_variants,
 6816                        column_name=pzfield,
 6817                        column_type="STRING",
 6818                        default_value="''",
 6819                    )
 6820                added_columns.append(added_column)
 6821
 6822            # Profiles
 6823            if profiles:
 6824
 6825                # foreach profile in configuration file
 6826                for profile in prioritizations_config:
 6827
 6828                    # If profile is asked in param, or ALL are asked (empty profile [])
 6829                    if profile in profiles or profiles == []:
 6830                        log.info(f"Profile '{profile}'")
 6831
 6832                        sql_set_info_option = ""
 6833
 6834                        sql_set_info = []
 6835
 6836                        # PZ fields set
 6837
 6838                        # PZScore
 6839                        if (
 6840                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 6841                            in list_of_pzfields
 6842                        ):
 6843                            sql_set_info.append(
 6844                                f"""
 6845                                    concat(
 6846                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 6847                                        {pz_prefix}Score{pzfields_sep}{profile}
 6848                                    ) 
 6849                                """
 6850                            )
 6851                            if (
 6852                                profile == default_profile
 6853                                and f"{pz_prefix}Score" in list_of_pzfields
 6854                            ):
 6855                                sql_set_info.append(
 6856                                    f"""
 6857                                        concat(
 6858                                            '{pz_prefix}Score=',
 6859                                            {pz_prefix}Score{pzfields_sep}{profile}
 6860                                        )
 6861                                    """
 6862                                )
 6863
 6864                        # PZFlag
 6865                        if (
 6866                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 6867                            in list_of_pzfields
 6868                        ):
 6869                            sql_set_info.append(
 6870                                f"""
 6871                                    concat(
 6872                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 6873                                        CASE 
 6874                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 6875                                            THEN 'PASS'
 6876                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 6877                                            THEN 'FILTERED'
 6878                                        END
 6879                                    ) 
 6880                                """
 6881                            )
 6882                            if (
 6883                                profile == default_profile
 6884                                and f"{pz_prefix}Flag" in list_of_pzfields
 6885                            ):
 6886                                sql_set_info.append(
 6887                                    f"""
 6888                                        concat(
 6889                                            '{pz_prefix}Flag=',
 6890                                            CASE 
 6891                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 6892                                                THEN 'PASS'
 6893                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 6894                                                THEN 'FILTERED'
 6895                                            END
 6896                                        )
 6897                                    """
 6898                                )
 6899
 6900                        # PZComment
 6901                        if (
 6902                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 6903                            in list_of_pzfields
 6904                        ):
 6905                            sql_set_info.append(
 6906                                f"""
 6907                                    CASE
 6908                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 6909                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 6910                                        ELSE ''
 6911                                    END
 6912                                """
 6913                            )
 6914                            if (
 6915                                profile == default_profile
 6916                                and f"{pz_prefix}Comment" in list_of_pzfields
 6917                            ):
 6918                                sql_set_info.append(
 6919                                    f"""
 6920                                        CASE
 6921                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 6922                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 6923                                            ELSE ''
 6924                                        END
 6925                                    """
 6926                                )
 6927
 6928                        # PZInfos
 6929                        if (
 6930                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 6931                            in list_of_pzfields
 6932                        ):
 6933                            sql_set_info.append(
 6934                                f"""
 6935                                    CASE
 6936                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 6937                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 6938                                        ELSE ''
 6939                                    END
 6940                                """
 6941                            )
 6942                            if (
 6943                                profile == default_profile
 6944                                and f"{pz_prefix}Infos" in list_of_pzfields
 6945                            ):
 6946                                sql_set_info.append(
 6947                                    f"""
 6948                                        CASE
 6949                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 6950                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 6951                                            ELSE ''
 6952                                        END
 6953                                    """
 6954                                )
 6955
 6956                        # Merge PZfields
 6957                        sql_set_info_option = ""
 6958                        sql_set_sep = ""
 6959                        for sql_set in sql_set_info:
 6960                            if sql_set_sep:
 6961                                sql_set_info_option += f"""
 6962                                    , concat('{sql_set_sep}', {sql_set})
 6963                                """
 6964                            else:
 6965                                sql_set_info_option += f"""
 6966                                    , {sql_set}
 6967                                """
 6968                            sql_set_sep = ";"
 6969
 6970                        sql_queries = []
 6971                        for annotation in prioritizations_config[profile]:
 6972
 6973                            # Explode specific annotation
 6974                            log.debug(f"Explode annotation '{annotation}'")
 6975                            added_columns += self.explode_infos(
 6976                                prefix=explode_infos_prefix,
 6977                                fields=[annotation],
 6978                                table=table_variants,
 6979                            )
 6980                            extra_infos = self.get_extra_infos(table=table_variants)
 6981
 6982                            # Check if annotation field is present
 6983                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
 6984                                log.debug(f"Annotation '{annotation}' not in data")
 6985                                continue
 6986                            else:
 6987                                log.debug(f"Annotation '{annotation}' in data")
 6988
 6989                            # For each criterions
 6990                            for criterion in prioritizations_config[profile][
 6991                                annotation
 6992                            ]:
 6993                                criterion_type = criterion["type"]
 6994                                criterion_value = criterion["value"]
 6995                                criterion_score = criterion.get("score", 0)
 6996                                criterion_flag = criterion.get("flag", "PASS")
 6997                                criterion_flag_bool = criterion_flag == "PASS"
 6998                                criterion_comment = (
 6999                                    ", ".join(criterion.get("comment", []))
 7000                                    .replace("'", "''")
 7001                                    .replace(";", ",")
 7002                                    .replace("\t", " ")
 7003                                )
 7004                                criterion_infos = (
 7005                                    str(criterion)
 7006                                    .replace("'", "''")
 7007                                    .replace(";", ",")
 7008                                    .replace("\t", " ")
 7009                                )
 7010
 7011                                sql_set = []
 7012                                sql_set_info = []
 7013
 7014                                # PZ fields set
 7015                                if (
 7016                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7017                                    in list_of_pzfields
 7018                                ):
 7019                                    if prioritization_score_mode == "HOWARD":
 7020                                        sql_set.append(
 7021                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7022                                        )
 7023                                    elif prioritization_score_mode == "VaRank":
 7024                                        sql_set.append(
 7025                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
 7026                                        )
 7027                                    else:
 7028                                        sql_set.append(
 7029                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7030                                        )
 7031                                if (
 7032                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7033                                    in list_of_pzfields
 7034                                ):
 7035                                    sql_set.append(
 7036                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7037                                    )
 7038                                if (
 7039                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7040                                    in list_of_pzfields
 7041                                ):
 7042                                    sql_set.append(
 7043                                        f"""
 7044                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7045                                                concat(
 7046                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7047                                                    CASE 
 7048                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7049                                                        THEN ', '
 7050                                                        ELSE ''
 7051                                                    END,
 7052                                                    '{criterion_comment}'
 7053                                                )
 7054                                        """
 7055                                    )
 7056                                if (
 7057                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7058                                    in list_of_pzfields
 7059                                ):
 7060                                    sql_set.append(
 7061                                        f"""
 7062                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7063                                                concat(
 7064                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7065                                                    '{criterion_infos}'
 7066                                                )
 7067                                        """
 7068                                    )
 7069                                sql_set_option = ",".join(sql_set)
 7070
 7071                                # Criterion and comparison
 7072                                if sql_set_option:
 7073                                    try:
 7074                                        float(criterion_value)
 7075                                        sql_update = f"""
 7076                                            UPDATE {table_variants}
 7077                                            SET {sql_set_option}
 7078                                            WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7079                                            AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7080                                            """
 7081                                    except:
 7082                                        contains_option = ""
 7083                                        if criterion_type == "contains":
 7084                                            contains_option = ".*"
 7085                                        sql_update = f"""
 7086                                            UPDATE {table_variants}
 7087                                            SET {sql_set_option}
 7088                                            WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7089                                            """
 7090                                    sql_queries.append(sql_update)
 7091                                else:
 7092                                    log.warning(
 7093                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7094                                    )
 7095
 7096                        # PZTags
 7097                        if (
 7098                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7099                            in list_of_pzfields
 7100                        ):
 7101
                            # Create PZTags value
 7103                            pztags_value = ""
 7104                            pztags_sep_default = "|"
 7105                            pztags_sep = ""
 7106                            for pzfield in pzfields:
 7107                                if pzfield not in [f"{pz_prefix}Tags"]:
 7108                                    if (
 7109                                        f"{pzfield}{pzfields_sep}{profile}"
 7110                                        in list_of_pzfields
 7111                                    ):
 7112                                        if pzfield in [f"{pz_prefix}Flag"]:
 7113                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7114                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7115                                                    THEN 'PASS'
 7116                                                    ELSE 'FILTERED'
 7117                                                END, '"""
 7118                                        else:
 7119                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7120                                        pztags_sep = pztags_sep_default
 7121
                            # Add query update for PZTags
 7123                            sql_update_pztags = f"""
 7124                                UPDATE {table_variants}
 7125                                SET INFO = concat(
 7126                                        INFO,
 7127                                        CASE WHEN INFO NOT in ('','.')
 7128                                                THEN ';'
 7129                                                ELSE ''
 7130                                        END,
 7131                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7132                                    )
 7133                                """
 7134                            sql_queries.append(sql_update_pztags)
 7135
                            # Add query update for PZTags for the default profile
 7137                            if profile == default_profile:
 7138                                sql_update_pztags_default = f"""
 7139                                UPDATE {table_variants}
 7140                                SET INFO = concat(
 7141                                        INFO,
 7142                                        ';',
 7143                                        '{pz_prefix}Tags={pztags_value}'
 7144                                    )
 7145                                """
 7146                                sql_queries.append(sql_update_pztags_default)
 7147
 7148                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7149
 7150                        if sql_queries:
 7151
 7152                            for sql_query in sql_queries:
 7153                                log.debug(
 7154                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7155                                )
 7156                                self.conn.execute(sql_query)
 7157
 7158                        log.info(f"""Profile '{profile}' - Update... """)
 7159                        sql_query_update = f"""
 7160                            UPDATE {table_variants}
 7161                            SET INFO =  
 7162                                concat(
 7163                                    CASE
 7164                                        WHEN INFO NOT IN ('','.')
 7165                                        THEN concat(INFO, ';')
 7166                                        ELSE ''
 7167                                    END
 7168                                    {sql_set_info_option}
 7169                                )
 7170                        """
 7171                        self.conn.execute(sql_query_update)
 7172
 7173        else:
 7174
 7175            log.warning(f"No profiles in parameters")
 7176
 7177        # Remove added columns
 7178        for added_column in added_columns:
 7179            self.drop_column(column=added_column)
 7180
 7181        # Explode INFOS fields into table fields
 7182        if self.get_explode_infos():
 7183            self.explode_infos(
 7184                prefix=self.get_explode_infos_prefix(),
 7185                fields=self.get_explode_infos_fields(),
 7186                force=True,
 7187            )
 7188
 7189        return True
 7190
 7191    ###
 7192    # HGVS
 7193    ###
 7194
 7195    def annotation_hgvs(self, threads: int = None) -> None:
 7196        """
 7197        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
 7198        coordinates and alleles.
 7199
 7200        :param threads: The `threads` parameter is an optional integer that specifies the number of
 7201        threads to use for parallel processing. If no value is provided, it will default to the number
 7202        of threads obtained from the `get_threads()` method
 7203        :type threads: int
 7204        """
 7205
 7206        # Function for each partition of the Dask Dataframe
 7207        def partition_function(partition):
 7208            """
 7209            The function `partition_function` applies the `annotation_hgvs_partition` function to
 7210            each row of a DataFrame called `partition`.
 7211
 7212            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
 7213            to be processed
 7214            :return: the result of applying the "annotation_hgvs_partition" function to each row of
 7215            the "partition" dataframe along the axis 1.
 7216            """
 7217            return partition.apply(annotation_hgvs_partition, axis=1)
 7218
 7219        def annotation_hgvs_partition(row) -> str:
 7220            """
 7221            The function `annotation_hgvs_partition` takes in a row of data and returns a string
 7222            containing a list of HGVS names associated with the given genomic coordinates and alleles.
 7223
 7224            :param row: A dictionary-like object that contains the values for the following keys:
 7225            :return: a string that contains the HGVS names associated with the given row of data.
 7226            """
 7227
 7228            chr = row["CHROM"]
 7229            pos = row["POS"]
 7230            ref = row["REF"]
 7231            alt = row["ALT"]
 7232
 7233            # Find list of associated transcripts
 7234            transcripts_list = list(
 7235                polars_conn.execute(
 7236                    f"""
 7237                SELECT transcript
 7238                FROM refseq_df
 7239                WHERE CHROM='{chr}'
 7240                AND POS={pos}
 7241            """
 7242                )["transcript"]
 7243            )
 7244
 7245            # Full HGVS annotation in list
 7246            hgvs_full_list = []
 7247
 7248            for transcript_name in transcripts_list:
 7249
 7250                # Transcript
 7251                transcript = get_transcript(
 7252                    transcripts=transcripts, transcript_name=transcript_name
 7253                )
 7254                # Exon
 7255                if use_exon:
 7256                    exon = transcript.find_exon_number(pos)
 7257                else:
 7258                    exon = None
 7259                # Protein
 7260                transcript_protein = None
 7261                if use_protein or add_protein or full_format:
 7262                    transcripts_protein = list(
 7263                        polars_conn.execute(
 7264                            f"""
 7265                        SELECT protein
 7266                        FROM refseqlink_df
 7267                        WHERE transcript='{transcript_name}'
 7268                        LIMIT 1
 7269                    """
 7270                        )["protein"]
 7271                    )
 7272                    if len(transcripts_protein):
 7273                        transcript_protein = transcripts_protein[0]
 7274
 7275                # HGVS name
 7276                hgvs_name = format_hgvs_name(
 7277                    chr,
 7278                    pos,
 7279                    ref,
 7280                    alt,
 7281                    genome=genome,
 7282                    transcript=transcript,
 7283                    transcript_protein=transcript_protein,
 7284                    exon=exon,
 7285                    use_gene=use_gene,
 7286                    use_protein=use_protein,
 7287                    full_format=full_format,
 7288                    use_version=use_version,
 7289                    codon_type=codon_type,
 7290                )
 7291                hgvs_full_list.append(hgvs_name)
 7292                if add_protein and not use_protein and not full_format:
 7293                    hgvs_name = format_hgvs_name(
 7294                        chr,
 7295                        pos,
 7296                        ref,
 7297                        alt,
 7298                        genome=genome,
 7299                        transcript=transcript,
 7300                        transcript_protein=transcript_protein,
 7301                        exon=exon,
 7302                        use_gene=use_gene,
 7303                        use_protein=True,
 7304                        full_format=False,
 7305                        use_version=use_version,
 7306                        codon_type=codon_type,
 7307                    )
 7308                    hgvs_full_list.append(hgvs_name)
 7309
 7310            # Create liste of HGVS annotations
 7311            hgvs_full = ",".join(hgvs_full_list)
 7312
 7313            return hgvs_full
 7314
 7315        # Polars connexion
 7316        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7317
 7318        # Config
 7319        config = self.get_config()
 7320
 7321        # Databases
 7322        # Genome
 7323        databases_genomes_folders = (
 7324            config.get("folders", {})
 7325            .get("databases", {})
 7326            .get("genomes", DEFAULT_GENOME_FOLDER)
 7327        )
 7328        databases_genome = (
 7329            config.get("folders", {}).get("databases", {}).get("genomes", "")
 7330        )
 7331        # refseq database folder
 7332        databases_refseq_folders = (
 7333            config.get("folders", {})
 7334            .get("databases", {})
 7335            .get("refseq", DEFAULT_REFSEQ_FOLDER)
 7336        )
 7337        # refseq
 7338        databases_refseq = config.get("databases", {}).get("refSeq", None)
 7339        # refSeqLink
 7340        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
 7341
 7342        # Param
 7343        param = self.get_param()
 7344
 7345        # Quick HGVS
 7346        if "hgvs_options" in param and param.get("hgvs_options", ""):
 7347            log.info(f"Quick HGVS Annotation:")
 7348            if not param.get("hgvs", None):
 7349                param["hgvs"] = {}
 7350            for option in param.get("hgvs_options", "").split(","):
 7351                option_var_val = option.split("=")
 7352                option_var = option_var_val[0]
 7353                if len(option_var_val) > 1:
 7354                    option_val = option_var_val[1]
 7355                else:
 7356                    option_val = "True"
 7357                if option_val.upper() in ["TRUE"]:
 7358                    option_val = True
 7359                elif option_val.upper() in ["FALSE"]:
 7360                    option_val = False
 7361                log.info(f"   {option_var}={option_val}")
 7362                param["hgvs"][option_var] = option_val
 7363
 7364        # Check if HGVS annotation enabled
 7365        if "hgvs" in param:
 7366            log.info(f"HGVS Annotation... ")
 7367            for hgvs_option in param.get("hgvs", {}):
 7368                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
 7369        else:
 7370            return
 7371
 7372        # HGVS Param
 7373        param_hgvs = param.get("hgvs", {})
 7374        use_exon = param_hgvs.get("use_exon", False)
 7375        use_gene = param_hgvs.get("use_gene", False)
 7376        use_protein = param_hgvs.get("use_protein", False)
 7377        add_protein = param_hgvs.get("add_protein", False)
 7378        full_format = param_hgvs.get("full_format", False)
 7379        use_version = param_hgvs.get("use_version", False)
 7380        codon_type = param_hgvs.get("codon_type", "3")
 7381
        # refSeq and refSeqLink
 7383        databases_refseq = param_hgvs.get("refseq", databases_refseq)
 7384        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
 7385
 7386        # Assembly
 7387        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 7388
 7389        # Genome
 7390        genome_file = None
 7391        if find_genome(databases_genome):
 7392            genome_file = find_genome(databases_genome)
 7393        else:
 7394            genome_file = find_genome(
 7395                genome_path=databases_genomes_folders, assembly=assembly
 7396            )
 7397        log.debug("Genome: " + str(genome_file))
 7398
        # refSeq
 7400        refseq_file = find_file_prefix(
 7401            input_file=databases_refseq,
 7402            prefix="ncbiRefSeq",
 7403            folder=databases_refseq_folders,
 7404            assembly=assembly,
 7405        )
 7406        log.debug("refSeq: " + str(refseq_file))
 7407
 7408        # refSeqLink
 7409        refseqlink_file = find_file_prefix(
 7410            input_file=databases_refseqlink,
 7411            prefix="ncbiRefSeqLink",
 7412            folder=databases_refseq_folders,
 7413            assembly=assembly,
 7414        )
 7415        log.debug("refSeqLink: " + str(refseqlink_file))
 7416
 7417        # Threads
 7418        if not threads:
 7419            threads = self.get_threads()
 7420        log.debug("Threads: " + str(threads))
 7421
 7422        # Variables
 7423        table_variants = self.get_table_variants(clause="update")
 7424
 7425        # Get variants SNV and InDel only
 7426        query_variants = f"""
 7427            SELECT "#CHROM" AS CHROM, POS, REF, ALT
 7428            FROM {table_variants}
 7429            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
 7430            """
 7431        df_variants = self.get_query_to_df(query_variants)
 7432
 7433        # Added columns
 7434        added_columns = []
 7435
 7436        # Add hgvs column in variants table
 7437        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
 7438        added_column = self.add_column(
 7439            table_variants, hgvs_column_name, "STRING", default_value=None
 7440        )
 7441        added_columns.append(added_column)
 7442
 7443        log.debug(f"refSeq loading...")
 7444        # refSeq in duckDB
 7445        refseq_table = get_refseq_table(
 7446            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
 7447        )
 7448        # Loading all refSeq in Dataframe
 7449        refseq_query = f"""
 7450            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
 7451            FROM {refseq_table}
 7452            JOIN df_variants ON (
 7453                {refseq_table}.chrom = df_variants.CHROM
 7454                AND {refseq_table}.txStart<=df_variants.POS
 7455                AND {refseq_table}.txEnd>=df_variants.POS
 7456            )
 7457        """
 7458        refseq_df = self.conn.query(refseq_query).pl()
 7459
 7460        if refseqlink_file:
 7461            log.debug(f"refSeqLink loading...")
 7462            # refSeqLink in duckDB
 7463            refseqlink_table = get_refseq_table(
 7464                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
 7465            )
 7466            # Loading all refSeqLink in Dataframe
 7467            protacc_column = "protAcc_with_ver"
 7468            mrnaacc_column = "mrnaAcc_with_ver"
 7469            refseqlink_query = f"""
 7470                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
 7471                FROM {refseqlink_table} 
 7472                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
 7473                WHERE protAcc_without_ver IS NOT NULL
 7474            """
 7475            # Polars Dataframe
 7476            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
 7477
 7478        # Read RefSeq transcripts into a python dict/model.
 7479        log.debug(f"Transcripts loading...")
 7480        with tempfile.TemporaryDirectory() as tmpdir:
 7481            transcripts_query = f"""
 7482                COPY (
 7483                    SELECT {refseq_table}.*
 7484                    FROM {refseq_table}
 7485                    JOIN df_variants ON (
 7486                        {refseq_table}.chrom=df_variants.CHROM
 7487                        AND {refseq_table}.txStart<=df_variants.POS
 7488                        AND {refseq_table}.txEnd>=df_variants.POS
 7489                    )
 7490                )
 7491                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
 7492            """
 7493            self.conn.query(transcripts_query)
 7494            with open(f"{tmpdir}/transcript.tsv") as infile:
 7495                transcripts = read_transcripts(infile)
 7496
 7497        # Polars connexion
 7498        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7499
 7500        log.debug("Genome loading...")
 7501        # Read genome sequence using pyfaidx.
 7502        genome = Fasta(genome_file)
 7503
 7504        log.debug("Start annotation HGVS...")
 7505
 7506        # Create
 7507        # a Dask Dataframe from Pandas dataframe with partition as number of threads
 7508        ddf = dd.from_pandas(df_variants, npartitions=threads)
 7509
 7510        # Use dask.dataframe.apply() to apply function on each partition
 7511        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
 7512
 7513        # Convert Dask DataFrame to Pandas Dataframe
 7514        df = ddf.compute()
 7515
 7516        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
 7517        with tempfile.TemporaryDirectory() as tmpdir:
 7518            df_parquet = os.path.join(tmpdir, "df.parquet")
 7519            df.to_parquet(df_parquet)
 7520
 7521            # Update hgvs column
 7522            update_variant_query = f"""
 7523                UPDATE {table_variants}
 7524                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
 7525                FROM read_parquet('{df_parquet}') as df
 7526                WHERE variants."#CHROM" = df.CHROM
 7527                AND variants.POS = df.POS
 7528                AND variants.REF = df.REF
 7529                AND variants.ALT = df.ALT
 7530                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
 7531                """
 7532            self.execute_query(update_variant_query)
 7533
 7534        # Update INFO column
 7535        sql_query_update = f"""
 7536            UPDATE {table_variants}
 7537            SET INFO = 
 7538                concat(
 7539                    CASE 
 7540                        WHEN INFO NOT IN ('','.')
 7541                        THEN concat(INFO, ';')
 7542                        ELSE ''
 7543                    END,
 7544                    'hgvs=',
 7545                    {hgvs_column_name}
 7546                )
 7547            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
 7548            """
 7549        self.execute_query(sql_query_update)
 7550
 7551        # Add header
 7552        HGVS_INFOS = {
 7553            "hgvs": {
 7554                "ID": "hgvs",
 7555                "Number": ".",
 7556                "Type": "String",
 7557                "Description": f"HGVS annotatation with HOWARD",
 7558            }
 7559        }
 7560
 7561        for field in HGVS_INFOS:
 7562            field_ID = HGVS_INFOS[field]["ID"]
 7563            field_description = HGVS_INFOS[field]["Description"]
 7564            self.get_header().infos[field_ID] = vcf.parser._Info(
 7565                field_ID,
 7566                HGVS_INFOS[field]["Number"],
 7567                HGVS_INFOS[field]["Type"],
 7568                field_description,
 7569                "unknown",
 7570                "unknown",
 7571                code_type_map[HGVS_INFOS[field]["Type"]],
 7572            )
 7573
 7574        # Remove added columns
 7575        for added_column in added_columns:
 7576            self.drop_column(column=added_column)
 7577
 7578    ###
 7579    # Calculation
 7580    ###
 7581
 7582    def get_operations_help(
 7583        self, operations_config_dict: dict = {}, operations_config_file: str = None
 7584    ) -> list:
 7585
 7586        # Init
 7587        operations_help = []
 7588
 7589        # operations
 7590        operations = self.get_config_json(
 7591            name="calculations",
 7592            config_dict=operations_config_dict,
 7593            config_file=operations_config_file,
 7594        )
 7595        for op in operations:
 7596            op_name = operations[op].get("name", op).upper()
 7597            op_description = operations[op].get("description", op_name)
 7598            op_available = operations[op].get("available", False)
 7599            if op_available:
 7600                operations_help.append(f"   {op_name}: {op_description}")
 7601
 7602        # Sort operations
 7603        operations_help.sort()
 7604
 7605        # insert header
 7606        operations_help.insert(0, "Available calculation operations:")
 7607
 7608        # Return
 7609        return operations_help
 7610
 7611    def calculation(
 7612        self,
 7613        operations: dict = {},
 7614        operations_config_dict: dict = {},
 7615        operations_config_file: str = None,
 7616    ) -> None:
 7617        """
 7618        It takes a list of operations, and for each operation, it checks if it's a python or sql
 7619        operation, and then calls the appropriate function
 7620
 7621        param json example:
 7622            "calculation": {
 7623                "NOMEN": {
 7624                    "options": {
 7625                        "hgvs_field": "hgvs"
 7626                    },
 7627                "middle" : null
 7628            }
 7629        """
 7630
 7631        # Param
 7632        param = self.get_param()
 7633
 7634        # operations config
 7635        operations_config = self.get_config_json(
 7636            name="calculations",
 7637            config_dict=operations_config_dict,
 7638            config_file=operations_config_file,
 7639        )
 7640
 7641        # Upper keys
 7642        operations_config = {k.upper(): v for k, v in operations_config.items()}
 7643
 7644        # Calculations
 7645
 7646        # Operations from param
 7647        operations = param.get("calculation", {}).get("calculations", operations)
 7648
 7649        # Quick calculation - add
 7650        if param.get("calculations", None):
 7651            calculations_list = [
 7652                value for value in param.get("calculations", "").split(",")
 7653            ]
 7654            log.info(f"Quick Calculations:")
 7655            for calculation_key in calculations_list:
 7656                log.info(f"   {calculation_key}")
 7657            for calculation_operation in calculations_list:
 7658                if calculation_operation.upper() not in operations:
 7659                    operations[calculation_operation.upper()] = {}
 7660                    add_value_into_dict(
 7661                        dict_tree=param,
 7662                        sections=[
 7663                            "calculation",
 7664                            "calculations",
 7665                            calculation_operation.upper(),
 7666                        ],
 7667                        value={},
 7668                    )
 7669
 7670        # Operations for calculation
 7671        if not operations:
 7672            operations = param.get("calculation", {}).get("calculations", {})
 7673
 7674        if operations:
 7675            log.info(f"Calculations...")
 7676
 7677        # For each operations
 7678        for operation_name in operations:
 7679            operation_name = operation_name.upper()
 7680            if operation_name not in [""]:
 7681                if operation_name in operations_config:
 7682                    log.info(f"Calculation '{operation_name}'")
 7683                    operation = operations_config[operation_name]
 7684                    operation_type = operation.get("type", "sql")
 7685                    if operation_type == "python":
 7686                        self.calculation_process_function(
 7687                            operation=operation, operation_name=operation_name
 7688                        )
 7689                    elif operation_type == "sql":
 7690                        self.calculation_process_sql(
 7691                            operation=operation, operation_name=operation_name
 7692                        )
 7693                    else:
 7694                        log.error(
 7695                            f"Operations config: Type '{operation_type}' NOT available"
 7696                        )
 7697                        raise ValueError(
 7698                            f"Operations config: Type '{operation_type}' NOT available"
 7699                        )
 7700                else:
 7701                    log.error(
 7702                        f"Operations config: Calculation '{operation_name}' NOT available"
 7703                    )
 7704                    raise ValueError(
 7705                        f"Operations config: Calculation '{operation_name}' NOT available"
 7706                    )
 7707
 7708        # Explode INFOS fields into table fields
 7709        if self.get_explode_infos():
 7710            self.explode_infos(
 7711                prefix=self.get_explode_infos_prefix(),
 7712                fields=self.get_explode_infos_fields(),
 7713                force=True,
 7714            )
 7715
 7716    def calculation_process_sql(
 7717        self, operation: dict, operation_name: str = "unknown"
 7718    ) -> None:
 7719        """
 7720        The `calculation_process_sql` function takes in a mathematical operation as a string and
 7721        performs the operation, updating the specified table with the result.
 7722
 7723        :param operation: The `operation` parameter is a dictionary that contains information about the
 7724        mathematical operation to be performed. It includes the following keys:
 7725        :type operation: dict
 7726        :param operation_name: The `operation_name` parameter is a string that represents the name of
 7727        the mathematical operation being performed. It is used for logging and error handling purposes,
 7728        defaults to unknown
 7729        :type operation_name: str (optional)
 7730        """
 7731
 7732        # table variants
 7733        table_variants = self.get_table_variants(clause="alter")
 7734
 7735        # Operation infos
 7736        operation_name = operation.get("name", "unknown")
 7737        log.debug(f"process sql {operation_name}")
 7738        output_column_name = operation.get("output_column_name", operation_name)
 7739        output_column_type = operation.get("output_column_type", "String")
 7740        prefix = operation.get("explode_infos_prefix", "")
 7741        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 7742        output_column_description = operation.get(
 7743            "output_column_description", f"{operation_name} operation"
 7744        )
 7745        operation_query = operation.get("operation_query", None)
 7746        if isinstance(operation_query, list):
 7747            operation_query = " ".join(operation_query)
 7748        operation_info_fields = operation.get("info_fields", [])
 7749        operation_info_fields_check = operation.get("info_fields_check", False)
 7750        operation_info = operation.get("operation_info", True)
 7751
 7752        if operation_query:
 7753
 7754            # Info fields check
 7755            operation_info_fields_check_result = True
 7756            if operation_info_fields_check:
 7757                header_infos = self.get_header().infos
 7758                for info_field in operation_info_fields:
 7759                    operation_info_fields_check_result = (
 7760                        operation_info_fields_check_result
 7761                        and info_field in header_infos
 7762                    )
 7763
 7764            # If info fields available
 7765            if operation_info_fields_check_result:
 7766
 7767                # Added_columns
 7768                added_columns = []
 7769
 7770                # Create VCF header field
 7771                vcf_reader = self.get_header()
 7772                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 7773                    output_column_name,
 7774                    ".",
 7775                    output_column_type,
 7776                    output_column_description,
 7777                    "howard calculation",
 7778                    "0",
 7779                    self.code_type_map.get(output_column_type),
 7780                )
 7781
 7782                # Explode infos if needed
 7783                log.debug(f"calculation_process_sql prefix {prefix}")
 7784                added_columns += self.explode_infos(
 7785                    prefix=prefix,
 7786                    fields=[output_column_name] + operation_info_fields,
 7787                    force=True,
 7788                )
 7789
 7790                # Create column
 7791                added_column = self.add_column(
 7792                    table_name=table_variants,
 7793                    column_name=prefix + output_column_name,
 7794                    column_type=output_column_type_sql,
 7795                    default_value="null",
 7796                )
 7797                added_columns.append(added_column)
 7798
 7799                # Operation calculation
 7800                try:
 7801
 7802                    # Query to update calculation column
 7803                    sql_update = f"""
 7804                        UPDATE {table_variants}
 7805                        SET "{prefix}{output_column_name}" = ({operation_query})
 7806                    """
 7807                    self.conn.execute(sql_update)
 7808
 7809                    # Add to INFO
 7810                    if operation_info:
 7811                        sql_update_info = f"""
 7812                            UPDATE {table_variants}
 7813                            SET "INFO" =
 7814                                concat(
 7815                                    CASE
 7816                                        WHEN "INFO" IS NOT NULL
 7817                                        THEN concat("INFO", ';')
 7818                                        ELSE ''
 7819                                    END,
 7820                                    '{output_column_name}=',
 7821                                    "{prefix}{output_column_name}"
 7822                                )
 7823                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 7824                        """
 7825                        self.conn.execute(sql_update_info)
 7826
 7827                except:
 7828                    log.error(
 7829                        f"Operations config: Calculation '{operation_name}' query failed"
 7830                    )
 7831                    raise ValueError(
 7832                        f"Operations config: Calculation '{operation_name}' query failed"
 7833                    )
 7834
 7835                # Remove added columns
 7836                for added_column in added_columns:
 7837                    log.debug(f"added_column: {added_column}")
 7838                    self.drop_column(column=added_column)
 7839
 7840            else:
 7841                log.error(
 7842                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 7843                )
 7844                raise ValueError(
 7845                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 7846                )
 7847
 7848        else:
 7849            log.error(
 7850                f"Operations config: Calculation '{operation_name}' query NOT defined"
 7851            )
 7852            raise ValueError(
 7853                f"Operations config: Calculation '{operation_name}' query NOT defined"
 7854            )
 7855
 7856    def calculation_process_function(
 7857        self, operation: dict, operation_name: str = "unknown"
 7858    ) -> None:
 7859        """
 7860        The `calculation_process_function` takes in an operation dictionary and performs the specified
 7861        function with the given parameters.
 7862
 7863        :param operation: The `operation` parameter is a dictionary that contains information about the
 7864        operation to be performed. It has the following keys:
 7865        :type operation: dict
 7866        :param operation_name: The `operation_name` parameter is a string that represents the name of
 7867        the operation being performed. It is used for logging purposes, defaults to unknown
 7868        :type operation_name: str (optional)
 7869        """
 7870
 7871        operation_name = operation["name"]
 7872        log.debug(f"process sql {operation_name}")
 7873        function_name = operation["function_name"]
 7874        function_params = operation["function_params"]
 7875        getattr(self, function_name)(*function_params)
 7876
 7877    def calculation_variant_id(self) -> None:
 7878        """
 7879        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 7880        updates the INFO field of a variants table with the variant ID.
 7881        """
 7882
 7883        # variant_id annotation field
 7884        variant_id_tag = self.get_variant_id_column()
 7885        added_columns = [variant_id_tag]
 7886
 7887        # variant_id hgvs tags"
 7888        vcf_infos_tags = {
 7889            variant_id_tag: "howard variant ID annotation",
 7890        }
 7891
 7892        # Variants table
 7893        table_variants = self.get_table_variants()
 7894
 7895        # Header
 7896        vcf_reader = self.get_header()
 7897
 7898        # Add variant_id to header
 7899        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 7900            variant_id_tag,
 7901            ".",
 7902            "String",
 7903            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 7904            "howard calculation",
 7905            "0",
 7906            self.code_type_map.get("String"),
 7907        )
 7908
 7909        # Update
 7910        sql_update = f"""
 7911            UPDATE {table_variants}
 7912            SET "INFO" = 
 7913                concat(
 7914                    CASE
 7915                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 7916                        THEN ''
 7917                        ELSE concat("INFO", ';')
 7918                    END,
 7919                    '{variant_id_tag}=',
 7920                    "{variant_id_tag}"
 7921                )
 7922        """
 7923        self.conn.execute(sql_update)
 7924
 7925        # Remove added columns
 7926        for added_column in added_columns:
 7927            self.drop_column(column=added_column)
 7928
 7929    def calculation_extract_snpeff_hgvs(
 7930        self,
 7931        snpeff_hgvs: str = "snpeff_hgvs",
 7932        snpeff_field: str = "ANN",
 7933    ) -> None:
 7934        """
 7935        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
 7936        annotation field in a VCF file and adds them as a new column in the variants table.
 7937
 7938        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
 7939        function is used to specify the name of the column that will store the HGVS nomenclatures
 7940        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
 7941        snpeff_hgvs
 7942        :type snpeff_hgvs: str (optional)
 7943        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
 7944        function represents the field in the VCF file that contains SnpEff annotations. This field is
 7945        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
 7946        to ANN
 7947        :type snpeff_field: str (optional)
 7948        """
 7949
 7950        # Snpeff hgvs tags
 7951        vcf_infos_tags = {
 7952            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
 7953        }
 7954
 7955        # Prefix
 7956        prefix = self.get_explode_infos_prefix()
 7957        if prefix:
 7958            prefix = "INFO/"
 7959
 7960        # snpEff fields
 7961        speff_ann_infos = prefix + snpeff_field
 7962        speff_hgvs_infos = prefix + snpeff_hgvs
 7963
 7964        # Variants table
 7965        table_variants = self.get_table_variants()
 7966
 7967        # Header
 7968        vcf_reader = self.get_header()
 7969
 7970        # Add columns
 7971        added_columns = []
 7972
 7973        # Explode HGVS field in column
 7974        added_columns += self.explode_infos(fields=[snpeff_field])
 7975
 7976        if snpeff_field in vcf_reader.infos:
 7977
 7978            log.debug(vcf_reader.infos[snpeff_field])
 7979
 7980            # Extract ANN header
 7981            ann_description = vcf_reader.infos[snpeff_field].desc
 7982            pattern = r"'(.+?)'"
 7983            match = re.search(pattern, ann_description)
 7984            if match:
 7985                ann_header_match = match.group(1).split(" | ")
 7986                ann_header_desc = {}
 7987                for i in range(len(ann_header_match)):
 7988                    ann_header_info = "".join(
 7989                        char for char in ann_header_match[i] if char.isalnum()
 7990                    )
 7991                    ann_header_desc[ann_header_info] = ann_header_match[i]
 7992                if not ann_header_desc:
 7993                    raise ValueError("Invalid header description format")
 7994            else:
 7995                raise ValueError("Invalid header description format")
 7996
 7997            # Create variant id
 7998            variant_id_column = self.get_variant_id_column()
 7999            added_columns += [variant_id_column]
 8000
 8001            # Create dataframe
 8002            dataframe_snpeff_hgvs = self.get_query_to_df(
 8003                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8004            )
 8005
 8006            # Create main NOMEN column
 8007            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8008                speff_ann_infos
 8009            ].apply(
 8010                lambda x: extract_snpeff_hgvs(
 8011                    str(x), header=list(ann_header_desc.values())
 8012                )
 8013            )
 8014
 8015            # Add snpeff_hgvs to header
 8016            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
 8017                snpeff_hgvs,
 8018                ".",
 8019                "String",
 8020                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
 8021                "howard calculation",
 8022                "0",
 8023                self.code_type_map.get("String"),
 8024            )
 8025
 8026            # Update
 8027            sql_update = f"""
 8028                UPDATE variants
 8029                SET "INFO" = 
 8030                    concat(
 8031                        CASE
 8032                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8033                            THEN ''
 8034                            ELSE concat("INFO", ';')
 8035                        END,
 8036                        CASE 
 8037                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8038                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8039                            THEN concat(
 8040                                    '{snpeff_hgvs}=',
 8041                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8042                                )
 8043                            ELSE ''
 8044                        END
 8045                    )
 8046                FROM dataframe_snpeff_hgvs
 8047                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8048
 8049            """
 8050            self.conn.execute(sql_update)
 8051
 8052            # Delete dataframe
 8053            del dataframe_snpeff_hgvs
 8054            gc.collect()
 8055
 8056        else:
 8057
 8058            log.warning(
 8059                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8060            )
 8061
 8062        # Remove added columns
 8063        for added_column in added_columns:
 8064            self.drop_column(column=added_column)
 8065
 8066    def calculation_snpeff_ann_explode(
 8067        self,
 8068        uniquify: bool = True,
 8069        output_format: str = "fields",
 8070        output_prefix: str = "snpeff_",
 8071        snpeff_field: str = "ANN",
 8072    ) -> None:
 8073        """
 8074        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8075        exploding the HGVS field and updating variant information accordingly.
 8076
 8077        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8078        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8079        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8080        defaults to True
 8081        :type uniquify: bool (optional)
 8082        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8083        function specifies the format in which the output annotations will be generated. It has a
 8084        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8085        format, defaults to fields
 8086        :type output_format: str (optional)
 8087        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8088        method is used to specify the prefix that will be added to the output annotations generated
 8089        during the calculation process. This prefix helps to differentiate the newly added annotations
 8090        from existing ones in the output data. By default, the, defaults to ANN_
 8091        :type output_prefix: str (optional)
 8092        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8093        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8094        field will be processed to explode the HGVS annotations and update the variant information
 8095        accordingly, defaults to ANN
 8096        :type snpeff_field: str (optional)
 8097        """
 8098
 8099        # SnpEff annotation field
 8100        snpeff_hgvs = "snpeff_ann_explode"
 8101
 8102        # Snpeff hgvs tags
 8103        vcf_infos_tags = {
 8104            snpeff_hgvs: "Explode snpEff annotations",
 8105        }
 8106
 8107        # Prefix
 8108        prefix = self.get_explode_infos_prefix()
 8109        if prefix:
 8110            prefix = "INFO/"
 8111
 8112        # snpEff fields
 8113        speff_ann_infos = prefix + snpeff_field
 8114        speff_hgvs_infos = prefix + snpeff_hgvs
 8115
 8116        # Variants table
 8117        table_variants = self.get_table_variants()
 8118
 8119        # Header
 8120        vcf_reader = self.get_header()
 8121
 8122        # Add columns
 8123        added_columns = []
 8124
 8125        # Explode HGVS field in column
 8126        added_columns += self.explode_infos(fields=[snpeff_field])
 8127        log.debug(f"snpeff_field={snpeff_field}")
 8128        log.debug(f"added_columns={added_columns}")
 8129
 8130        if snpeff_field in vcf_reader.infos:
 8131
 8132            # Extract ANN header
 8133            ann_description = vcf_reader.infos[snpeff_field].desc
 8134            pattern = r"'(.+?)'"
 8135            match = re.search(pattern, ann_description)
 8136            if match:
 8137                ann_header_match = match.group(1).split(" | ")
 8138                ann_header = []
 8139                ann_header_desc = {}
 8140                for i in range(len(ann_header_match)):
 8141                    ann_header_info = "".join(
 8142                        char for char in ann_header_match[i] if char.isalnum()
 8143                    )
 8144                    ann_header.append(ann_header_info)
 8145                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8146                if not ann_header_desc:
 8147                    raise ValueError("Invalid header description format")
 8148            else:
 8149                raise ValueError("Invalid header description format")
 8150
 8151            # Create variant id
 8152            variant_id_column = self.get_variant_id_column()
 8153            added_columns += [variant_id_column]
 8154
 8155            # Create dataframe
 8156            dataframe_snpeff_hgvs = self.get_query_to_df(
 8157                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8158            )
 8159
 8160            # Create snpEff columns
 8161            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8162                speff_ann_infos
 8163            ].apply(
 8164                lambda x: explode_snpeff_ann(
 8165                    str(x),
 8166                    uniquify=uniquify,
 8167                    output_format=output_format,
 8168                    prefix=output_prefix,
 8169                    header=list(ann_header_desc.values()),
 8170                )
 8171            )
 8172
 8173            # Header
 8174            ann_annotations_prefix = ""
 8175            if output_format.upper() in ["JSON"]:
 8176                ann_annotations_prefix = f"{output_prefix}="
 8177                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8178                    output_prefix,
 8179                    ".",
 8180                    "String",
 8181                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8182                    + " - JSON format",
 8183                    "howard calculation",
 8184                    "0",
 8185                    self.code_type_map.get("String"),
 8186                )
 8187            else:
 8188                for ann_annotation in ann_header:
 8189                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8190                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8191                        ann_annotation_id,
 8192                        ".",
 8193                        "String",
 8194                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8195                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8196                        "howard calculation",
 8197                        "0",
 8198                        self.code_type_map.get("String"),
 8199                    )
 8200
 8201            # Update
 8202            sql_update = f"""
 8203                UPDATE variants
 8204                SET "INFO" = 
 8205                    concat(
 8206                        CASE
 8207                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8208                            THEN ''
 8209                            ELSE concat("INFO", ';')
 8210                        END,
 8211                        CASE 
 8212                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8213                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8214                            THEN concat(
 8215                                '{ann_annotations_prefix}',
 8216                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8217                                )
 8218                            ELSE ''
 8219                        END
 8220                    )
 8221                FROM dataframe_snpeff_hgvs
 8222                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8223
 8224            """
 8225            self.conn.execute(sql_update)
 8226
 8227            # Delete dataframe
 8228            del dataframe_snpeff_hgvs
 8229            gc.collect()
 8230
 8231        else:
 8232
 8233            log.warning(
 8234                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8235            )
 8236
 8237        # Remove added columns
 8238        for added_column in added_columns:
 8239            self.drop_column(column=added_column)
 8240
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        The HGVS annotation field (param
        'calculation.calculations.NOMEN.options.hgvs_field', default "hgvs")
        is exploded into a table column, each value is parsed with
        `find_nomen`, and the resulting NOMEN components (NOMEN, CNOMEN,
        PNOMEN, ...) are appended to the INFO field of the variants table.
        An optional transcripts file ('...options.transcripts') provides the
        transcript preference used by `find_nomen`.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # NOMEN field: temporary dataframe column holding the dict returned by find_nomen
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: one INFO tag per component, with its VCF header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix used when INFO fields are exploded into columns
        prefix = self.get_explode_infos_prefix()

        # Header (pyvcf-style reader object; its .infos dict is extended below)
        vcf_reader = self.get_header()

        # Get HGVS field name from param (default "hgvs")
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get transcripts preference file from param (optional)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                # First column of the transcripts file is the ordered transcript list
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Columns added to the variants table for this calculation; dropped at the end
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos: available exploded columns; skip silently if HGVS was not exploded
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe (joined back on #CHROM/POS/REF/ALT below)
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Create main NOMEN column: find_nomen returns a dict of NOMEN components
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # One CASE per component: appends ';FIELD=value' only when non-empty
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update (CASE expressions become additional concat() arguments)
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update: DuckDB resolves 'dataframe_hgvs' by the Python variable name
            # (replacement scan), joining on the variant coordinates
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8383
 8384    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 8385        """
 8386        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 8387        pipeline/sample for a variant and updates the variant information in a VCF file.
 8388
 8389        :param tag: The `tag` parameter is a string that represents the annotation field for the
 8390        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 8391        VCF header and to update the corresponding field in the variants table, defaults to
 8392        findbypipeline
 8393        :type tag: str (optional)
 8394        """
 8395
 8396        # if FORMAT and samples
 8397        if (
 8398            "FORMAT" in self.get_header_columns_as_list()
 8399            and self.get_header_sample_list()
 8400        ):
 8401
 8402            # findbypipeline annotation field
 8403            findbypipeline_tag = tag
 8404
 8405            # VCF infos tags
 8406            vcf_infos_tags = {
 8407                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 8408            }
 8409
 8410            # Prefix
 8411            prefix = self.get_explode_infos_prefix()
 8412
 8413            # Field
 8414            findbypipeline_infos = prefix + findbypipeline_tag
 8415
 8416            # Variants table
 8417            table_variants = self.get_table_variants()
 8418
 8419            # Header
 8420            vcf_reader = self.get_header()
 8421
 8422            # Create variant id
 8423            variant_id_column = self.get_variant_id_column()
 8424            added_columns = [variant_id_column]
 8425
 8426            # variant_id, FORMAT and samples
 8427            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8428                self.get_header_sample_list()
 8429            )
 8430
 8431            # Create dataframe
 8432            dataframe_findbypipeline = self.get_query_to_df(
 8433                f""" SELECT {samples_fields} FROM {table_variants} """
 8434            )
 8435
 8436            # Create findbypipeline column
 8437            dataframe_findbypipeline[findbypipeline_infos] = (
 8438                dataframe_findbypipeline.apply(
 8439                    lambda row: findbypipeline(
 8440                        row, samples=self.get_header_sample_list()
 8441                    ),
 8442                    axis=1,
 8443                )
 8444            )
 8445
 8446            # Add snpeff_hgvs to header
 8447            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 8448                findbypipeline_tag,
 8449                ".",
 8450                "String",
 8451                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 8452                "howard calculation",
 8453                "0",
 8454                self.code_type_map.get("String"),
 8455            )
 8456
 8457            # Update
 8458            sql_update = f"""
 8459                UPDATE variants
 8460                SET "INFO" = 
 8461                    concat(
 8462                        CASE
 8463                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8464                            THEN ''
 8465                            ELSE concat("INFO", ';')
 8466                        END,
 8467                        CASE 
 8468                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 8469                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 8470                            THEN concat(
 8471                                    '{findbypipeline_tag}=',
 8472                                    dataframe_findbypipeline."{findbypipeline_infos}"
 8473                                )
 8474                            ELSE ''
 8475                        END
 8476                    )
 8477                FROM dataframe_findbypipeline
 8478                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 8479            """
 8480            self.conn.execute(sql_update)
 8481
 8482            # Remove added columns
 8483            for added_column in added_columns:
 8484                self.drop_column(column=added_column)
 8485
 8486            # Delete dataframe
 8487            del dataframe_findbypipeline
 8488            gc.collect()
 8489
 8490    def calculation_genotype_concordance(self) -> None:
 8491        """
 8492        The function `calculation_genotype_concordance` calculates the genotype concordance for
 8493        multi-caller VCF files and updates the variant information in the database.
 8494        """
 8495
 8496        # if FORMAT and samples
 8497        if (
 8498            "FORMAT" in self.get_header_columns_as_list()
 8499            and self.get_header_sample_list()
 8500        ):
 8501
 8502            # genotypeconcordance annotation field
 8503            genotypeconcordance_tag = "genotypeconcordance"
 8504
 8505            # VCF infos tags
 8506            vcf_infos_tags = {
 8507                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 8508            }
 8509
 8510            # Prefix
 8511            prefix = self.get_explode_infos_prefix()
 8512
 8513            # Field
 8514            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 8515
 8516            # Variants table
 8517            table_variants = self.get_table_variants()
 8518
 8519            # Header
 8520            vcf_reader = self.get_header()
 8521
 8522            # Create variant id
 8523            variant_id_column = self.get_variant_id_column()
 8524            added_columns = [variant_id_column]
 8525
 8526            # variant_id, FORMAT and samples
 8527            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8528                self.get_header_sample_list()
 8529            )
 8530
 8531            # Create dataframe
 8532            dataframe_genotypeconcordance = self.get_query_to_df(
 8533                f""" SELECT {samples_fields} FROM {table_variants} """
 8534            )
 8535
 8536            # Create genotypeconcordance column
 8537            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 8538                dataframe_genotypeconcordance.apply(
 8539                    lambda row: genotypeconcordance(
 8540                        row, samples=self.get_header_sample_list()
 8541                    ),
 8542                    axis=1,
 8543                )
 8544            )
 8545
 8546            # Add genotypeconcordance to header
 8547            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 8548                genotypeconcordance_tag,
 8549                ".",
 8550                "String",
 8551                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 8552                "howard calculation",
 8553                "0",
 8554                self.code_type_map.get("String"),
 8555            )
 8556
 8557            # Update
 8558            sql_update = f"""
 8559                UPDATE variants
 8560                SET "INFO" = 
 8561                    concat(
 8562                        CASE
 8563                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8564                            THEN ''
 8565                            ELSE concat("INFO", ';')
 8566                        END,
 8567                        CASE
 8568                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 8569                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 8570                            THEN concat(
 8571                                    '{genotypeconcordance_tag}=',
 8572                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 8573                                )
 8574                            ELSE ''
 8575                        END
 8576                    )
 8577                FROM dataframe_genotypeconcordance
 8578                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 8579            """
 8580            self.conn.execute(sql_update)
 8581
 8582            # Remove added columns
 8583            for added_column in added_columns:
 8584                self.drop_column(column=added_column)
 8585
 8586            # Delete dataframe
 8587            del dataframe_genotypeconcordance
 8588            gc.collect()
 8589
 8590    def calculation_barcode(self, tag: str = "barcode") -> None:
 8591        """
 8592        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 8593        updates the INFO field in the file with the calculated barcode values.
 8594
 8595        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 8596        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 8597        the default tag name is set to "barcode", defaults to barcode
 8598        :type tag: str (optional)
 8599        """
 8600
 8601        # if FORMAT and samples
 8602        if (
 8603            "FORMAT" in self.get_header_columns_as_list()
 8604            and self.get_header_sample_list()
 8605        ):
 8606
 8607            # barcode annotation field
 8608            if not tag:
 8609                tag = "barcode"
 8610
 8611            # VCF infos tags
 8612            vcf_infos_tags = {
 8613                tag: "barcode calculation (VaRank)",
 8614            }
 8615
 8616            # Prefix
 8617            prefix = self.get_explode_infos_prefix()
 8618
 8619            # Field
 8620            barcode_infos = prefix + tag
 8621
 8622            # Variants table
 8623            table_variants = self.get_table_variants()
 8624
 8625            # Header
 8626            vcf_reader = self.get_header()
 8627
 8628            # Create variant id
 8629            variant_id_column = self.get_variant_id_column()
 8630            added_columns = [variant_id_column]
 8631
 8632            # variant_id, FORMAT and samples
 8633            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8634                self.get_header_sample_list()
 8635            )
 8636
 8637            # Create dataframe
 8638            dataframe_barcode = self.get_query_to_df(
 8639                f""" SELECT {samples_fields} FROM {table_variants} """
 8640            )
 8641
 8642            # Create barcode column
 8643            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 8644                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 8645            )
 8646
 8647            # Add barcode to header
 8648            vcf_reader.infos[tag] = vcf.parser._Info(
 8649                tag,
 8650                ".",
 8651                "String",
 8652                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 8653                "howard calculation",
 8654                "0",
 8655                self.code_type_map.get("String"),
 8656            )
 8657
 8658            # Update
 8659            sql_update = f"""
 8660                UPDATE {table_variants}
 8661                SET "INFO" = 
 8662                    concat(
 8663                        CASE
 8664                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8665                            THEN ''
 8666                            ELSE concat("INFO", ';')
 8667                        END,
 8668                        CASE
 8669                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 8670                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 8671                            THEN concat(
 8672                                    '{tag}=',
 8673                                    dataframe_barcode."{barcode_infos}"
 8674                                )
 8675                            ELSE ''
 8676                        END
 8677                    )
 8678                FROM dataframe_barcode
 8679                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 8680            """
 8681            self.conn.execute(sql_update)
 8682
 8683            # Remove added columns
 8684            for added_column in added_columns:
 8685                self.drop_column(column=added_column)
 8686
 8687            # Delete dataframe
 8688            del dataframe_barcode
 8689            gc.collect()
 8690
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the INFO field in the file with the calculated barcode values.

        The family pedigree (param
        'calculation.calculations.BARCODEFAMILY.family_pedigree') may be a JSON
        file path, a JSON string, a comma-separated sample list, or a dict;
        when absent, all samples are used. The barcode computed over the
        pedigree samples is appended as two new FORMAT fields ('<tag>' and
        '<tag>S') to every sample column rather than to INFO.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not well formatted or resolves to no sample
        """

        # if FORMAT and samples: genotype data is required for this calculation
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against an explicit empty/None tag)
            if not tag:
                tag = "BCF"

            # VCF infos tags: descriptions for the two FORMAT fields added below
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix used when INFO fields are exploded into columns
            prefix = self.get_explode_infos_prefix()

            # PED param: pedigree definition (file path, JSON string, CSV string or dict)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED: normalize every accepted input shape into a dict
            # mapping family member -> sample name
            if ped:

                # Pedigree is a file (NOTE: 'ped' is deliberately rebound from
                # path string to file handle to parsed dict)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: try JSON first, otherwise treat it as a
                # comma-separated sample list (each sample mapped to itself)
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict: already in the expected shape
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of sample names from the pedigree values
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use every sample, mapped to itself
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree: must resolve to at least one sample
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field: dataframe column that will hold the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (added for the join, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples (pedigree samples only)
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe (its variable name is referenced by the SQL below)
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column: one barcode per variant over the pedigree samples
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family to header
            # Add vaf_normalization to header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update: build one SET clause per sample column (plus FORMAT).
            # Pedigree samples get the computed barcode and the sample list;
            # FORMAT gets the new field names; other samples get '.' values.
            # for sample in ped_samples:
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For './.' genotypes, derive a './.:.:...' placeholder from FORMAT
                # by stripping field names and padding each ':' with '.'
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            # DuckDB resolves 'dataframe_barcode' by the Python variable name
            # (replacement scan), joining on the variant id column
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_barcode
            gc.collect()
 8880
 8881    def calculation_trio(self) -> None:
 8882        """
 8883        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 8884        information to the INFO field of each variant.
 8885        """
 8886
 8887        # if FORMAT and samples
 8888        if (
 8889            "FORMAT" in self.get_header_columns_as_list()
 8890            and self.get_header_sample_list()
 8891        ):
 8892
 8893            # trio annotation field
 8894            trio_tag = "trio"
 8895
 8896            # VCF infos tags
 8897            vcf_infos_tags = {
 8898                "trio": "trio calculation",
 8899            }
 8900
 8901            # Param
 8902            param = self.get_param()
 8903
 8904            # Prefix
 8905            prefix = self.get_explode_infos_prefix()
 8906
 8907            # Trio param
 8908            trio_ped = (
 8909                param.get("calculation", {})
 8910                .get("calculations", {})
 8911                .get("TRIO", {})
 8912                .get("trio_pedigree", None)
 8913            )
 8914
 8915            # Load trio
 8916            if trio_ped:
 8917
 8918                # Trio pedigree is a file
 8919                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 8920                    log.debug("TRIO pedigree is file")
 8921                    with open(full_path(trio_ped)) as trio_ped:
 8922                        trio_ped = json.load(trio_ped)
 8923
 8924                # Trio pedigree is a string
 8925                elif isinstance(trio_ped, str):
 8926                    log.debug("TRIO pedigree is str")
 8927                    try:
 8928                        trio_ped = json.loads(trio_ped)
 8929                        log.debug("TRIO pedigree is json str")
 8930                    except ValueError as e:
 8931                        trio_samples = trio_ped.split(",")
 8932                        if len(trio_samples) == 3:
 8933                            trio_ped = {
 8934                                "father": trio_samples[0],
 8935                                "mother": trio_samples[1],
 8936                                "child": trio_samples[2],
 8937                            }
 8938                            log.debug("TRIO pedigree is list str")
 8939                        else:
 8940                            msg_error = "TRIO pedigree not well formatted"
 8941                            log.error(msg_error)
 8942                            raise ValueError(msg_error)
 8943
 8944                # Trio pedigree is a dict
 8945                elif isinstance(trio_ped, dict):
 8946                    log.debug("TRIO pedigree is dict")
 8947
 8948                # Trio pedigree is not well formatted
 8949                else:
 8950                    msg_error = "TRIO pedigree not well formatted"
 8951                    log.error(msg_error)
 8952                    raise ValueError(msg_error)
 8953
 8954                # Construct trio list
 8955                trio_samples = [
 8956                    trio_ped.get("father", ""),
 8957                    trio_ped.get("mother", ""),
 8958                    trio_ped.get("child", ""),
 8959                ]
 8960
 8961            else:
 8962                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 8963                samples_list = self.get_header_sample_list()
 8964                if len(samples_list) >= 3:
 8965                    trio_samples = self.get_header_sample_list()[0:3]
 8966                    trio_ped = {
 8967                        "father": trio_samples[0],
 8968                        "mother": trio_samples[1],
 8969                        "child": trio_samples[2],
 8970                    }
 8971                else:
 8972                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 8973                    log.error(msg_error)
 8974                    raise ValueError(msg_error)
 8975
 8976            # Check trio pedigree
 8977            if not trio_ped or len(trio_ped) != 3:
 8978                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 8979                log.error(msg_error)
 8980                raise ValueError(msg_error)
 8981
 8982            # Log
 8983            log.info(
 8984                f"Calculation 'TRIO' - Samples: "
 8985                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 8986            )
 8987
 8988            # Field
 8989            trio_infos = prefix + trio_tag
 8990
 8991            # Variants table
 8992            table_variants = self.get_table_variants()
 8993
 8994            # Header
 8995            vcf_reader = self.get_header()
 8996
 8997            # Create variant id
 8998            variant_id_column = self.get_variant_id_column()
 8999            added_columns = [variant_id_column]
 9000
 9001            # variant_id, FORMAT and samples
 9002            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9003                self.get_header_sample_list()
 9004            )
 9005
 9006            # Create dataframe
 9007            dataframe_trio = self.get_query_to_df(
 9008                f""" SELECT {samples_fields} FROM {table_variants} """
 9009            )
 9010
 9011            # Create trio column
 9012            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9013                lambda row: trio(row, samples=trio_samples), axis=1
 9014            )
 9015
 9016            # Add trio to header
 9017            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9018                trio_tag,
 9019                ".",
 9020                "String",
 9021                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9022                "howard calculation",
 9023                "0",
 9024                self.code_type_map.get("String"),
 9025            )
 9026
 9027            # Update
 9028            sql_update = f"""
 9029                UPDATE {table_variants}
 9030                SET "INFO" = 
 9031                    concat(
 9032                        CASE
 9033                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9034                            THEN ''
 9035                            ELSE concat("INFO", ';')
 9036                        END,
 9037                        CASE
 9038                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9039                             AND dataframe_trio."{trio_infos}" NOT NULL
 9040                            THEN concat(
 9041                                    '{trio_tag}=',
 9042                                    dataframe_trio."{trio_infos}"
 9043                                )
 9044                            ELSE ''
 9045                        END
 9046                    )
 9047                FROM dataframe_trio
 9048                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9049            """
 9050            self.conn.execute(sql_update)
 9051
 9052            # Remove added columns
 9053            for added_column in added_columns:
 9054                self.drop_column(column=added_column)
 9055
 9056            # Delete dataframe
 9057            del dataframe_trio
 9058            gc.collect()
 9059
 9060    def calculation_vaf_normalization(self) -> None:
 9061        """
 9062        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9063        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9064        :return: The function does not return anything.
 9065        """
 9066
 9067        # if FORMAT and samples
 9068        if (
 9069            "FORMAT" in self.get_header_columns_as_list()
 9070            and self.get_header_sample_list()
 9071        ):
 9072
 9073            # vaf_normalization annotation field
 9074            vaf_normalization_tag = "VAF"
 9075
 9076            # VCF infos tags
 9077            vcf_infos_tags = {
 9078                "VAF": "VAF Variant Frequency",
 9079            }
 9080
 9081            # Prefix
 9082            prefix = self.get_explode_infos_prefix()
 9083
 9084            # Variants table
 9085            table_variants = self.get_table_variants()
 9086
 9087            # Header
 9088            vcf_reader = self.get_header()
 9089
 9090            # Do not calculate if VAF already exists
 9091            if "VAF" in vcf_reader.formats:
 9092                log.debug("VAF already on genotypes")
 9093                return
 9094
 9095            # Create variant id
 9096            variant_id_column = self.get_variant_id_column()
 9097            added_columns = [variant_id_column]
 9098
 9099            # variant_id, FORMAT and samples
 9100            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9101                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9102            )
 9103
 9104            # Create dataframe
 9105            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9106            log.debug(f"query={query}")
 9107            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9108
 9109            vaf_normalization_set = []
 9110
 9111            # for each sample vaf_normalization
 9112            for sample in self.get_header_sample_list():
 9113                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9114                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9115                )
 9116                vaf_normalization_set.append(
 9117                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9118                )
 9119
 9120            # Add VAF to FORMAT
 9121            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9122                "FORMAT"
 9123            ].apply(lambda x: str(x) + ":VAF")
 9124            vaf_normalization_set.append(
 9125                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9126            )
 9127
 9128            # Add vaf_normalization to header
 9129            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9130                id=vaf_normalization_tag,
 9131                num="1",
 9132                type="Float",
 9133                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9134                type_code=self.code_type_map.get("Float"),
 9135            )
 9136
 9137            # Create fields to add in INFO
 9138            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9139
 9140            # Update
 9141            sql_update = f"""
 9142                UPDATE {table_variants}
 9143                SET {sql_vaf_normalization_set}
 9144                FROM dataframe_vaf_normalization
 9145                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9146
 9147            """
 9148            self.conn.execute(sql_update)
 9149
 9150            # Remove added columns
 9151            for added_column in added_columns:
 9152                self.drop_column(column=added_column)
 9153
 9154            # Delete dataframe
 9155            del dataframe_vaf_normalization
 9156            gc.collect()
 9157
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Seven INFO tags are generated (`<info>_stats_nb`, `_list`, `_min`, `_max`,
        `_mean`, `_mediane`, `_stdev`), registered in the VCF header, and appended
        to the INFO column of every variant. Nothing is done when the file has no
        FORMAT column or no samples.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Nothing to do without genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Name of the intermediate column holding the per-variant stats dict
            vaf_stats_tag = info + "_stats"

            # Header description for each generated INFO tag
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Full name of the stats dict column
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Load genotypes into a dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the stats dict for each variant across all samples
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per stat tag, later concatenated into INFO
            sql_vaf_stats_fields = []

            # One pass per generated stat tag
            for stat in vcf_infos_tags:

                # Extract this stat into its own dataframe column
                # (the lambda is applied within this iteration, so `stat` is
                # bound to the current value — no late-binding issue)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Register the stat tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # ';' separator between consecutive stat fields (none before the first)
                # NOTE(review): the separator is baked into each fragment, so if the
                # first stat is NULL the second still carries its leading ';' —
                # confirm a possible double ';' in INFO is acceptable
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # SQL fragment emitting '<sep><stat>=<value>' when the value is not NULL
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # Concatenation of all stat fragments for the SET clause
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Append the stat tags to INFO, joined on the variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
 9295
 9296    def calculation_transcripts_json(self, info: str = "transcripts_json") -> None:
 9297        """
 9298        The function `calculation_transcripts_json` creates a transcripts table and adds an info field
 9299        to it if transcripts are available.
 9300
 9301        :param info: The `info` parameter in the `calculation_transcripts_json` method is a string
 9302        parameter that specifies the information field to be used in the transcripts JSON. It has a
 9303        default value of "transcripts_json" if no value is provided when calling the method, defaults to
 9304        transcripts_json
 9305        :type info: str (optional)
 9306        """
 9307
 9308        # Create transcripts table
 9309        transcripts_table = self.create_transcript_view()
 9310
 9311        # Add info field
 9312        if transcripts_table:
 9313            self.transcript_view_to_variants(
 9314                transcripts_table=transcripts_table, transcripts_info_field=info
 9315            )
 9316        else:
 9317            log.info("No Transcripts to process. Check param.json file configuration")
 9318
 9319    def calculation_transcripts_prioritization(self) -> None:
 9320        """
 9321        The function `calculation_transcripts_prioritization` creates a transcripts table and
 9322        prioritizes transcripts based on certain criteria.
 9323        """
 9324
 9325        # Create transcripts table
 9326        transcripts_table = self.create_transcript_view()
 9327
 9328        # Add info field
 9329        if transcripts_table:
 9330            self.transcripts_prioritization(transcripts_table=transcripts_table)
 9331        else:
 9332            log.info("No Transcripts to process. Check param.json file configuration")
 9333
 9334    ###############
 9335    # Transcripts #
 9336    ###############
 9337
 9338    def transcripts_prioritization(
 9339        self, transcripts_table: str = None, param: dict = {}
 9340    ) -> bool:
 9341        """
 9342        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
 9343        and updates the variants table with the prioritized information.
 9344
 9345        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
 9346        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
 9347        This parameter is used to identify the table where the transcripts data is stored for the
 9348        prioritization process
 9349        :type transcripts_table: str
 9350        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
 9351        that contains various configuration settings for the prioritization process of transcripts. It
 9352        is used to customize the behavior of the prioritization algorithm and includes settings such as
 9353        the prefix for prioritization fields, default profiles, and other
 9354        :type param: dict
 9355        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
 9356        transcripts prioritization process is successfully completed, and `False` if there are any
 9357        issues or if no profile is defined for transcripts prioritization.
 9358        """
 9359
 9360        log.debug("Start transcripts prioritization...")
 9361
 9362        # Param
 9363        if not param:
 9364            param = self.get_param()
 9365
 9366        # Variants table
 9367        table_variants = self.get_table_variants()
 9368        log.debug(f"transcripts_table={transcripts_table}")
 9369        # Transcripts table
 9370        if transcripts_table is None:
 9371            log.debug(f"transcripts_table={transcripts_table}")
 9372            transcripts_table = self.create_transcript_view(
 9373                transcripts_table="transcripts", param=param
 9374            )
 9375            log.debug(f"transcripts_table={transcripts_table}")
 9376        if transcripts_table is None:
 9377            msg_err = "No Transcripts table availalble"
 9378            log.error(msg_err)
 9379            raise ValueError(msg_err)
 9380
 9381        # Get transcripts columns
 9382        columns_as_list_query = f"""
 9383            DESCRIBE {transcripts_table}
 9384        """
 9385        columns_as_list = list(
 9386            self.get_query_to_df(columns_as_list_query)["column_name"]
 9387        )
 9388
 9389        # Create INFO if not exists
 9390        if "INFO" not in columns_as_list:
 9391            query_add_info = f"""
 9392                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
 9393            """
 9394            self.execute_query(query_add_info)
 9395
 9396        # Prioritization param and Force only PZ Score and Flag
 9397        pz_param = param.get("transcripts", {}).get("prioritization", {})
 9398        pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
 9399        pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
 9400        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
 9401        pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
 9402        pz_profile_default = (
 9403            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
 9404        )
 9405
 9406        # Exit if no profile
 9407        if pz_profile_default is None:
 9408            log.warning("No profile defined for transcripts prioritization")
 9409            return False
 9410
 9411        # Prioritization
 9412        prioritization_result = self.prioritization(
 9413            table=transcripts_table,
 9414            pz_param=param.get("transcripts", {}).get("prioritization", {}),
 9415        )
 9416        if not prioritization_result:
 9417            log.warning("Transcripts prioritization not processed")
 9418            return False
 9419
 9420        # Explode PZ fields
 9421        self.explode_infos(
 9422            table=transcripts_table,
 9423            fields=param.get("transcripts", {})
 9424            .get("prioritization", {})
 9425            .get("pzfields", []),
 9426        )
 9427
 9428        # Export Transcripts prioritization infos to variants table
 9429        query_update = f"""
 9430            WITH RankedTranscripts AS (
 9431                SELECT
 9432                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
 9433                    ROW_NUMBER() OVER (
 9434                        PARTITION BY "#CHROM", POS, REF, ALT
 9435                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
 9436                    ) AS rn
 9437                FROM
 9438                    {transcripts_table}
 9439            )
 9440            UPDATE {table_variants}
 9441                SET
 9442                INFO = CONCAT(CASE
 9443                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9444                            THEN ''
 9445                            ELSE concat("INFO", ';')
 9446                        END,
 9447                        concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
 9448                        )
 9449            FROM
 9450                RankedTranscripts
 9451            WHERE
 9452                rn = 1
 9453                AND variants."#CHROM" = RankedTranscripts."#CHROM"
 9454                AND variants."POS" = RankedTranscripts."POS"
 9455                AND variants."REF" = RankedTranscripts."REF"
 9456                AND variants."ALT" = RankedTranscripts."ALT"
 9457                
 9458        """
 9459        self.execute_query(query=query_update)
 9460
 9461        # Add PZ Transcript in header
 9462        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
 9463            pz_fields_transcripts,
 9464            ".",
 9465            "String",
 9466            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
 9467            "unknown",
 9468            "unknown",
 9469            code_type_map["String"],
 9470        )
 9471
 9472        # Return
 9473        return True
 9474
 9475    def create_transcript_view_from_columns_map(
 9476        self,
 9477        transcripts_table: str = "transcripts",
 9478        columns_maps: dict = {},
 9479        added_columns: list = [],
 9480        temporary_tables: list = None,
 9481        annotation_fields: list = None,
 9482    ) -> tuple[list, list, list]:
 9483        """
 9484        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
 9485        specified columns mapping for transcripts data.
 9486
 9487        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9488        the table where the transcripts data is stored or will be stored in the database. This table
 9489        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
 9490        predictions, etc. It defaults to "transcripts, defaults to transcripts
 9491        :type transcripts_table: str (optional)
 9492        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
 9493        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
 9494        represents a mapping configuration for a specific set of columns. It typically includes details such
 9495        as the main transcript column and additional information columns
 9496        :type columns_maps: dict
 9497        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
 9498        function is a list that stores the additional columns that will be added to the view being created
 9499        based on the columns map provided. These columns are generated by exploding the transcript
 9500        information columns along with the main transcript column
 9501        :type added_columns: list
 9502        :param temporary_tables: The `temporary_tables` parameter in the
 9503        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
 9504        tables created during the process of creating a transcript view from a columns map. These temporary
 9505        tables are used to store intermediate results or transformations before the final view is generated
 9506        :type temporary_tables: list
 9507        :param annotation_fields: The `annotation_fields` parameter in the
 9508        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
 9509        for annotation in the query view creation process. These fields are extracted from the
 9510        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
 9511        :type annotation_fields: list
 9512        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
 9513        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
 9514        """
 9515
 9516        log.debug("Start transcrpts view creation from columns map...")
 9517
 9518        # "from_columns_map": [
 9519        #     {
 9520        #         "transcripts_column": "Ensembl_transcriptid",
 9521        #         "transcripts_infos_columns": [
 9522        #             "genename",
 9523        #             "Ensembl_geneid",
 9524        #             "LIST_S2_score",
 9525        #             "LIST_S2_pred",
 9526        #         ],
 9527        #     },
 9528        #     {
 9529        #         "transcripts_column": "Ensembl_transcriptid",
 9530        #         "transcripts_infos_columns": [
 9531        #             "genename",
 9532        #             "VARITY_R_score",
 9533        #             "Aloft_pred",
 9534        #         ],
 9535        #     },
 9536        # ],
 9537
 9538        # Init
 9539        if temporary_tables is None:
 9540            temporary_tables = []
 9541        if annotation_fields is None:
 9542            annotation_fields = []
 9543
 9544        # Variants table
 9545        table_variants = self.get_table_variants()
 9546
 9547        for columns_map in columns_maps:
 9548
 9549            # Transcript column
 9550            transcripts_column = columns_map.get("transcripts_column", None)
 9551
 9552            # Transcripts infos columns
 9553            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
 9554
 9555            if transcripts_column is not None:
 9556
 9557                # Explode
 9558                added_columns += self.explode_infos(
 9559                    fields=[transcripts_column] + transcripts_infos_columns
 9560                )
 9561
 9562                # View clauses
 9563                clause_select = []
 9564                for field in [transcripts_column] + transcripts_infos_columns:
 9565                    clause_select.append(
 9566                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
 9567                    )
 9568                    if field not in [transcripts_column]:
 9569                        annotation_fields.append(field)
 9570
 9571                # Querey View
 9572                query = f""" 
 9573                    SELECT
 9574                        "#CHROM", POS, REF, ALT,
 9575                        "{transcripts_column}" AS 'transcript',
 9576                        {", ".join(clause_select)}
 9577                    FROM (
 9578                        SELECT 
 9579                            "#CHROM", POS, REF, ALT,
 9580                            {", ".join(clause_select)}
 9581                        FROM {table_variants}
 9582                        )
 9583                    WHERE "{transcripts_column}" IS NOT NULL
 9584                """
 9585
 9586                # Create temporary table
 9587                temporary_table = transcripts_table + "".join(
 9588                    random.choices(string.ascii_uppercase + string.digits, k=10)
 9589                )
 9590
 9591                # Temporary_tables
 9592                temporary_tables.append(temporary_table)
 9593                query_view = f"""
 9594                    CREATE TEMPORARY TABLE {temporary_table}
 9595                    AS ({query})
 9596                """
 9597                self.execute_query(query=query_view)
 9598
 9599        return added_columns, temporary_tables, annotation_fields
 9600
 9601    def create_transcript_view_from_column_format(
 9602        self,
 9603        transcripts_table: str = "transcripts",
 9604        column_formats: dict = {},
 9605        temporary_tables: list = None,
 9606        annotation_fields: list = None,
 9607    ) -> tuple[list, list, list]:
 9608        """
 9609        The `create_transcript_view_from_column_format` function generates a transcript view based on
 9610        specified column formats, adds additional columns and annotation fields, and returns the list of
 9611        temporary tables and annotation fields.
 9612
 9613        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9614        the table containing the transcripts data. This table will be used as the base table for creating
 9615        the transcript view. The default value for this parameter is "transcripts", but you can provide a
 9616        different table name if needed, defaults to transcripts
 9617        :type transcripts_table: str (optional)
 9618        :param column_formats: The `column_formats` parameter is a dictionary that contains information
 9619        about the columns to be used for creating the transcript view. Each entry in the dictionary
 9620        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
 9621        the provided code snippet:
 9622        :type column_formats: dict
 9623        :param temporary_tables: The `temporary_tables` parameter in the
 9624        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
 9625        views created during the process of creating a transcript view from a column format. These temporary
 9626        views are used to manipulate and extract data before generating the final transcript view. It
 9627        :type temporary_tables: list
 9628        :param annotation_fields: The `annotation_fields` parameter in the
 9629        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
 9630        that are extracted from the temporary views created during the process. These annotation fields are
 9631        obtained by querying the temporary views and extracting the column names excluding specific columns
 9632        like `#CH
 9633        :type annotation_fields: list
 9634        :return: The `create_transcript_view_from_column_format` function returns two lists:
 9635        `temporary_tables` and `annotation_fields`.
 9636        """
 9637
 9638        log.debug("Start transcrpts view creation from column format...")
 9639
 9640        #  "from_column_format": [
 9641        #     {
 9642        #         "transcripts_column": "ANN",
 9643        #         "transcripts_infos_column": "Feature_ID",
 9644        #     }
 9645        # ],
 9646
 9647        # Init
 9648        if temporary_tables is None:
 9649            temporary_tables = []
 9650        if annotation_fields is None:
 9651            annotation_fields = []
 9652
 9653        for column_format in column_formats:
 9654
 9655            # annotation field and transcript annotation field
 9656            annotation_field = column_format.get("transcripts_column", "ANN")
 9657            transcript_annotation = column_format.get(
 9658                "transcripts_infos_column", "Feature_ID"
 9659            )
 9660
 9661            # Temporary View name
 9662            temporary_view_name = transcripts_table + "".join(
 9663                random.choices(string.ascii_uppercase + string.digits, k=10)
 9664            )
 9665
 9666            # Create temporary view name
 9667            temporary_view_name = self.annotation_format_to_table(
 9668                uniquify=True,
 9669                annotation_field=annotation_field,
 9670                view_name=temporary_view_name,
 9671                annotation_id=transcript_annotation,
 9672            )
 9673
 9674            # Annotation fields
 9675            if temporary_view_name:
 9676                query_annotation_fields = f"""
 9677                    SELECT *
 9678                    FROM (
 9679                        DESCRIBE SELECT *
 9680                        FROM {temporary_view_name}
 9681                        )
 9682                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
 9683                """
 9684                df_annotation_fields = self.get_query_to_df(
 9685                    query=query_annotation_fields
 9686                )
 9687
 9688                # Add temporary view and annotation fields
 9689                temporary_tables.append(temporary_view_name)
 9690                annotation_fields += list(set(df_annotation_fields["column_name"]))
 9691
 9692        return temporary_tables, annotation_fields
 9693
 9694    def create_transcript_view(
 9695        self,
 9696        transcripts_table: str = None,
 9697        transcripts_table_drop: bool = True,
 9698        param: dict = {},
 9699    ) -> str:
 9700        """
 9701        The `create_transcript_view` function generates a transcript view by processing data from a
 9702        specified table based on provided parameters and structural information.
 9703
 9704        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
 9705        is used to specify the name of the table that will store the final transcript view data. If a table
 9706        name is not provided, the function will create a new table to store the transcript view data, and by
 9707        default,, defaults to transcripts
 9708        :type transcripts_table: str (optional)
 9709        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
 9710        `create_transcript_view` function is a boolean parameter that determines whether to drop the
 9711        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
 9712        the function will drop the existing transcripts table if it exists, defaults to True
 9713        :type transcripts_table_drop: bool (optional)
 9714        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
 9715        contains information needed to create a transcript view. It includes details such as the structure
 9716        of the transcripts, columns mapping, column formats, and other necessary information for generating
 9717        the view. This parameter allows for flexibility and customization
 9718        :type param: dict
 9719        :return: The `create_transcript_view` function returns the name of the transcripts table that was
 9720        created or modified during the execution of the function.
 9721        """
 9722
 9723        log.debug("Start transcripts view creation...")
 9724
 9725        # Default
 9726        transcripts_table_default = "transcripts"
 9727
 9728        # Param
 9729        if not param:
 9730            param = self.get_param()
 9731
 9732        # Struct
 9733        struct = param.get("transcripts", {}).get("struct", None)
 9734
 9735        if struct:
 9736
 9737            # Transcripts table
 9738            if transcripts_table is None:
 9739                transcripts_table = param.get("transcripts", {}).get(
 9740                    "table", transcripts_table_default
 9741                )
 9742
 9743            # added_columns
 9744            added_columns = []
 9745
 9746            # Temporary tables
 9747            temporary_tables = []
 9748
 9749            # Annotation fields
 9750            annotation_fields = []
 9751
 9752            # from columns map
 9753            columns_maps = struct.get("from_columns_map", [])
 9754            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
 9755                self.create_transcript_view_from_columns_map(
 9756                    transcripts_table=transcripts_table,
 9757                    columns_maps=columns_maps,
 9758                    added_columns=added_columns,
 9759                    temporary_tables=temporary_tables,
 9760                    annotation_fields=annotation_fields,
 9761                )
 9762            )
 9763            added_columns += added_columns_tmp
 9764            temporary_tables += temporary_tables_tmp
 9765            annotation_fields += annotation_fields_tmp
 9766
 9767            # from column format
 9768            column_formats = struct.get("from_column_format", [])
 9769            temporary_tables_tmp, annotation_fields_tmp = (
 9770                self.create_transcript_view_from_column_format(
 9771                    transcripts_table=transcripts_table,
 9772                    column_formats=column_formats,
 9773                    temporary_tables=temporary_tables,
 9774                    annotation_fields=annotation_fields,
 9775                )
 9776            )
 9777            temporary_tables += temporary_tables_tmp
 9778            annotation_fields += annotation_fields_tmp
 9779
 9780            # Merge temporary tables query
 9781            query_merge = ""
 9782            for temporary_table in temporary_tables:
 9783
 9784                # First temporary table
 9785                if not query_merge:
 9786                    query_merge = f"""
 9787                        SELECT * FROM {temporary_table}
 9788                    """
 9789                # other temporary table (using UNION)
 9790                else:
 9791                    query_merge += f"""
 9792                        UNION BY NAME SELECT * FROM {temporary_table}
 9793                    """
 9794
 9795            # Merge on transcript
 9796            query_merge_on_transcripts_annotation_fields = []
 9797            # Aggregate all annotations fields
 9798            for annotation_field in set(annotation_fields):
 9799                query_merge_on_transcripts_annotation_fields.append(
 9800                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
 9801                )
 9802            # Query for transcripts view
 9803            query_merge_on_transcripts = f"""
 9804                SELECT "#CHROM", POS, REF, ALT, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
 9805                FROM ({query_merge})
 9806                GROUP BY "#CHROM", POS, REF, ALT, transcript
 9807            """
 9808
 9809            # Drop transcript view is necessary
 9810            if transcripts_table_drop:
 9811                query_drop = f"""
 9812                    DROP TABLE IF EXISTS {transcripts_table};
 9813                """
 9814                self.execute_query(query=query_drop)
 9815
 9816            # Merge and create transcript view
 9817            query_create_view = f"""
 9818                CREATE TABLE IF NOT EXISTS {transcripts_table}
 9819                AS {query_merge_on_transcripts}
 9820            """
 9821            self.execute_query(query=query_create_view)
 9822
 9823            # Remove added columns
 9824            for added_column in added_columns:
 9825                self.drop_column(column=added_column)
 9826
 9827        else:
 9828
 9829            transcripts_table = None
 9830
 9831        return transcripts_table
 9832
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        Explode a structured annotation INFO field (e.g. a snpEff/VEP-style "ANN"
        field) into a temporary table with one column per annotation sub-field.

        The annotation header is parsed from the single-quoted part of the INFO
        field description in the VCF header; each record's annotation string is
        exploded into JSON, each JSON key is typed by sampling its values, and
        the result is materialized as a temporary table with the annotation id
        exposed as the 'transcript' column.

        :param uniquify: whether to uniquify values when exploding the annotation
        string, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: name of the INFO field containing the annotation;
        it must be declared in the VCF header, otherwise None is returned,
        defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: annotation sub-field used as the transcript
        identifier (non-alphanumeric characters are stripped so it can be used
        as a SQL identifier), defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: name of the temporary table to create, defaults to
        transcripts
        :type view_name: str (optional)
        :return: the name of the created temporary table, or None when the
        annotation field is not present in the VCF header
        """

        # Name of the intermediate column holding the exploded annotation as JSON
        annotation_format = "annotation_explode"

        # Keep only alphanumeric characters so the id is a safe SQL identifier
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix for exploded INFO columns.
        # NOTE(review): any truthy configured prefix is replaced by the literal
        # "INFO/"; a None return would make the concatenations below fail —
        # confirm get_explode_infos_prefix() always returns a string.
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Fully-prefixed column names for the annotation field and its exploded form
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header
        vcf_reader = self.get_header()

        # Columns added along the way; they are dropped again before returning
        added_columns = []

        # Explode the annotation INFO field into its own column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the annotation header: sub-field names are expected inside
            # single quotes in the INFO description, separated by " | "
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sanitized name (alphanumeric only) mapped to the original label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (added to the variants table, dropped later)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variants with the annotation column into a dataframe
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Explode each annotation string into JSON keyed by the header labels
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Collect the distinct JSON keys actually present in the data.
            # NOTE: the queries below reference the local dataframe by its variable
            # name ("dataframe_annotation_format") — do not rename that local
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # For each key, detect its column type and build the SELECT clause
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]

                # Sanitized key for use as a SQL identifier
                key_clean = "".join(char for char in key if char.isalnum())

                # Extract all values for this key in order to sample its type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the remaining non-empty values
                column_type = detect_column_type(df_json_type[key_clean])

                # Cast empty strings to NULL and everything else to the detected type
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
                )

            # Materialize the exploded annotations as a temporary table, exposing
            # the annotation id as the 'transcript' column
            query_view = f"""CREATE TEMPORARY TABLE {view_name} AS (SELECT *, {annotation_id} AS 'transcript' FROM (SELECT "#CHROM", POS, REF, ALT, {",".join(query_json_key)} FROM dataframe_annotation_format));"""
            self.execute_query(query=query_view)

        else:

            # Annotation field not declared in the header: nothing to explode
            view_name = None

        # Remove the helper columns added during the process
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
 9985
    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field: str = None,
        param: dict = {},
    ) -> bool:
        """
        Aggregate the transcripts table back onto the variants table as JSON.

        For each variant, all transcript rows are folded into a single JSON
        object keyed by transcript id, and written to a dedicated JSON column
        (`transcripts_info_json`), appended to the INFO field under the
        `transcripts_info_field` tag, or both. The corresponding INFO header
        entries are registered in the VCF header.

        :param transcripts_table: name of the table containing the transcripts
        data; falls back to param["transcripts"]["table"] (default "transcripts")
        :type transcripts_table: str
        :param transcripts_column_id: column of the transcripts table holding the
        transcript identifier; falls back to param["transcripts"]["column_id"]
        (default "transcript")
        :type transcripts_column_id: str
        :param transcripts_info_json: name of the variants-table column that will
        receive the transcripts JSON; no column is created when None
        :type transcripts_info_json: str
        :param transcripts_info_field: name of the INFO field that will receive
        the transcripts JSON; INFO is left untouched when None
        :type transcripts_info_field: str
        :param param: parameters dictionary; when empty, self.get_param() is used
        :type param: dict
        :return: True when the variants table was updated, False when neither
        `transcripts_info_json` nor `transcripts_info_field` is configured
        """

        log.debug("Start transcripts view to JSON...")

        # Defaults
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_field_default = None

        # Param (NOTE(review): mutable default argument `param={}` — safe only
        # as long as this dict is never mutated; confirm)
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info JSON column
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field
        if transcripts_info_field is None:
            transcripts_info_field = param.get("transcripts", {}).get(
                "transcripts_info_field", transcripts_info_field_default
            )

        # Variants table
        table_variants = self.get_table_variants()

        # Nothing to write back: neither a JSON column nor an INFO field is set
        if transcripts_info_json is None and transcripts_info_field is None:
            return False

        # Transcripts infos columns: every column except the variant key and the
        # transcript id becomes a JSON attribute
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
                )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # Build the SELECT clause (split comma-joined values into rows) and the
        # JSON struct clause for each transcripts info column
        clause_select = []
        clause_to_json = []
        for field in transcripts_infos_columns:
            clause_select.append(
                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
            )
            clause_to_json.append(f""" '{field}': "{field}" """)

        # SET clauses of the final UPDATE
        update_set = []

        # VCF header
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json is not None:

            # Create the JSON column on the variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add to update
            update_set.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

            # Register the INFO header entry.
            # NOTE(review): "unknwon" looks like a typo for "unknown" in the
            # source/version fields — confirm before changing, as it is emitted
            # in the VCF header
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        # Transcripts to info field in JSON
        if transcripts_info_field is not None:

            # Append ';<field>=<json>' to INFO when the JSON value is non-empty.
            # NOTE(review): this clause reuses the {transcripts_info_json} alias;
            # when transcripts_info_json is None the SQL identifier literally
            # becomes "None" — it still matches the subquery alias below, but
            # confirm this is intentional. Also note the appended value starts
            # with ';' even when INFO was empty — confirm downstream tolerates a
            # leading semicolon.
            update_set.append(
                f""" 
                    INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            CASE
                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                                THEN concat(
                                    ';{transcripts_info_field}=',
                                    t.{transcripts_info_json}
                                )
                                ELSE ''
                            END
                            )
                """
            )

            # Register the INFO header entry (same "unknwon" note as above)
            vcf_reader.infos[transcripts_info_field] = vcf.parser._Info(
                transcripts_info_field,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        # Update query: fold transcript rows into one JSON object per variant
        # ('{"<transcript_id>": {field: value, ...}, ...}') and join it back on
        # the variant key
        query_update = f"""
            UPDATE {table_variants}
                SET {", ".join(update_set)}
            FROM
            (
                SELECT
                    "#CHROM", POS, REF, ALT,
                        concat(
                        '{{',
                        string_agg(
                            '"' || "{transcripts_column_id}" || '":' ||
                            to_json(json_output)
                        ),
                        '}}'
                        )::JSON AS {transcripts_info_json}
                FROM
                    (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                        "{transcripts_column_id}",
                        to_json(
                            {{{",".join(clause_to_json)}}}
                        )::JSON AS json_output
                    FROM
                        (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                    WHERE "{transcripts_column_id}" IS NOT NULL
                    )
                GROUP BY "#CHROM", POS, REF, ALT
            ) AS t
            WHERE {table_variants}."#CHROM" = t."#CHROM"
                AND {table_variants}."POS" = t."POS"
                AND {table_variants}."REF" = t."REF"
                AND {table_variants}."ALT" = t."ALT"
        """

        self.execute_query(query=query_update)

        return True
class Variants:
   34class Variants:
   35
   36    def __init__(
   37        self,
   38        conn=None,
   39        input: str = None,
   40        output: str = None,
   41        config: dict = {},
   42        param: dict = {},
   43        load: bool = False,
   44    ) -> None:
   45        """
   46        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   47        header
   48
   49        :param conn: the connection to the database
   50        :param input: the input file
   51        :param output: the output file
   52        :param config: a dictionary containing the configuration of the model
   53        :param param: a dictionary containing the parameters of the model
   54        """
   55
   56        # Init variables
   57        self.init_variables()
   58
   59        # Input
   60        self.set_input(input)
   61
   62        # Config
   63        self.set_config(config)
   64
   65        # Param
   66        self.set_param(param)
   67
   68        # Output
   69        self.set_output(output)
   70
   71        # connexion
   72        self.set_connexion(conn)
   73
   74        # Header
   75        self.set_header()
   76
   77        # Load data
   78        if load:
   79            self.load_data()
   80
   81    def set_input(self, input: str = None) -> None:
   82        """
   83        The function `set_input` takes a file name as input, extracts the name and extension, and sets
   84        attributes in the class accordingly.
   85
   86        :param input: The `set_input` method in the provided code snippet is used to set attributes
   87        related to the input file. Here's a breakdown of the parameters and their usage in the method:
   88        :type input: str
   89        """
   90
   91        if input and not isinstance(input, str):
   92            try:
   93                self.input = input.name
   94            except:
   95                log.error(f"Input file '{input} in bad format")
   96                raise ValueError(f"Input file '{input} in bad format")
   97        else:
   98            self.input = input
   99
  100        # Input format
  101        if input:
  102            input_name, input_extension = os.path.splitext(self.input)
  103            self.input_name = input_name
  104            self.input_extension = input_extension
  105            self.input_format = self.input_extension.replace(".", "")
  106
  107    def set_config(self, config: dict) -> None:
  108        """
  109        The set_config function takes a config object and assigns it as the configuration object for the
  110        class.
  111
  112        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  113        contains configuration settings for the class. When you call the `set_config` function with a
  114        dictionary object as the argument, it will set that dictionary as the configuration object for
  115        the class
  116        :type config: dict
  117        """
  118
  119        self.config = config
  120
  121    def set_param(self, param: dict) -> None:
  122        """
  123        This function sets a parameter object for the class based on the input dictionary.
  124
  125        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  126        as the `param` attribute of the class instance
  127        :type param: dict
  128        """
  129
  130        self.param = param
  131
  132    def init_variables(self) -> None:
  133        """
  134        This function initializes the variables that will be used in the rest of the class
  135        """
  136
  137        self.prefix = "howard"
  138        self.table_variants = "variants"
  139        self.dataframe = None
  140
  141        self.comparison_map = {
  142            "gt": ">",
  143            "gte": ">=",
  144            "lt": "<",
  145            "lte": "<=",
  146            "equals": "=",
  147            "contains": "SIMILAR TO",
  148        }
  149
  150        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  151
  152        self.code_type_map_to_sql = {
  153            "Integer": "INTEGER",
  154            "String": "VARCHAR",
  155            "Float": "FLOAT",
  156            "Flag": "VARCHAR",
  157        }
  158
  159        self.index_additionnal_fields = []
  160
  161    def get_indexing(self) -> bool:
  162        """
  163        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  164        returns False.
  165        :return: The value of the indexing parameter.
  166        """
  167
  168        return self.get_param().get("indexing", False)
  169
  170    def get_connexion_config(self) -> dict:
  171        """
  172        The function `get_connexion_config` returns a dictionary containing the configuration for a
  173        connection, including the number of threads and memory limit.
  174        :return: a dictionary containing the configuration for the Connexion library.
  175        """
  176
  177        # config
  178        config = self.get_config()
  179
  180        # Connexion config
  181        connexion_config = {}
  182        threads = self.get_threads()
  183
  184        # Threads
  185        if threads:
  186            connexion_config["threads"] = threads
  187
  188        # Memory
  189        # if config.get("memory", None):
  190        #     connexion_config["memory_limit"] = config.get("memory")
  191        if self.get_memory():
  192            connexion_config["memory_limit"] = self.get_memory()
  193
  194        # Temporary directory
  195        if config.get("tmp", None):
  196            connexion_config["temp_directory"] = config.get("tmp")
  197
  198        # Access
  199        if config.get("access", None):
  200            access = config.get("access")
  201            if access in ["RO"]:
  202                access = "READ_ONLY"
  203            elif access in ["RW"]:
  204                access = "READ_WRITE"
  205            connexion_db = self.get_connexion_db()
  206            if connexion_db in ":memory:":
  207                access = "READ_WRITE"
  208            connexion_config["access_mode"] = access
  209
  210        return connexion_config
  211
  212    def get_duckdb_settings(self) -> dict:
  213        """
  214        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  215        string.
  216        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  217        """
  218
  219        # config
  220        config = self.get_config()
  221
  222        # duckdb settings
  223        duckdb_settings_dict = {}
  224        if config.get("duckdb_settings", None):
  225            duckdb_settings = config.get("duckdb_settings")
  226            duckdb_settings = full_path(duckdb_settings)
  227            # duckdb setting is a file
  228            if os.path.exists(duckdb_settings):
  229                with open(duckdb_settings) as json_file:
  230                    duckdb_settings_dict = yaml.safe_load(json_file)
  231            # duckdb settings is a string
  232            else:
  233                duckdb_settings_dict = json.loads(duckdb_settings)
  234
  235        return duckdb_settings_dict
  236
  237    def set_connexion_db(self) -> str:
  238        """
  239        The function `set_connexion_db` returns the appropriate database connection string based on the
  240        input format and connection type.
  241        :return: the value of the variable `connexion_db`.
  242        """
  243
  244        # Default connexion db
  245        default_connexion_db = ":memory:"
  246
  247        # Find connexion db
  248        if self.get_input_format() in ["db", "duckdb"]:
  249            connexion_db = self.get_input()
  250        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  251            connexion_db = default_connexion_db
  252        elif self.get_connexion_type() in ["tmpfile"]:
  253            tmp_name = tempfile.mkdtemp(
  254                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  255            )
  256            connexion_db = f"{tmp_name}/tmp.db"
  257        elif self.get_connexion_type() != "":
  258            connexion_db = self.get_connexion_type()
  259        else:
  260            connexion_db = default_connexion_db
  261
  262        # Set connexion db
  263        self.connexion_db = connexion_db
  264
  265        return connexion_db
  266
  267    def set_connexion(self, conn) -> None:
  268        """
  269        The function `set_connexion` creates a connection to a database, with options for different
  270        database formats and settings.
  271
  272        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  273        database. If a connection is not provided, a new connection to an in-memory database is created.
  274        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  275        sqlite
  276        """
  277
  278        # Connexion db
  279        connexion_db = self.set_connexion_db()
  280
  281        # Connexion config
  282        connexion_config = self.get_connexion_config()
  283
  284        # Connexion format
  285        connexion_format = self.get_config().get("connexion_format", "duckdb")
  286        # Set connexion format
  287        self.connexion_format = connexion_format
  288
  289        # Connexion
  290        if not conn:
  291            if connexion_format in ["duckdb"]:
  292                conn = duckdb.connect(connexion_db, config=connexion_config)
  293                # duckDB settings
  294                duckdb_settings = self.get_duckdb_settings()
  295                if duckdb_settings:
  296                    for setting in duckdb_settings:
  297                        setting_value = duckdb_settings.get(setting)
  298                        if isinstance(setting_value, str):
  299                            setting_value = f"'{setting_value}'"
  300                        conn.execute(f"PRAGMA {setting}={setting_value};")
  301            elif connexion_format in ["sqlite"]:
  302                conn = sqlite3.connect(connexion_db)
  303
  304        # Set connexion
  305        self.conn = conn
  306
  307        # Log
  308        log.debug(f"connexion_format: {connexion_format}")
  309        log.debug(f"connexion_db: {connexion_db}")
  310        log.debug(f"connexion config: {connexion_config}")
  311        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  312
  313    def set_output(self, output: str = None) -> None:
  314        """
  315        The `set_output` function in Python sets the output file based on the input or a specified key
  316        in the config file, extracting the output name, extension, and format.
  317
  318        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  319        the output file. If the config file has an 'output' key, the method sets the output to the value
  320        of that key. If no output is provided, it sets the output to `None`
  321        :type output: str
  322        """
  323
  324        if output and not isinstance(output, str):
  325            self.output = output.name
  326        else:
  327            self.output = output
  328
  329        # Output format
  330        if self.output:
  331            output_name, output_extension = os.path.splitext(self.output)
  332            self.output_name = output_name
  333            self.output_extension = output_extension
  334            self.output_format = self.output_extension.replace(".", "")
  335        else:
  336            self.output_name = None
  337            self.output_extension = None
  338            self.output_format = None
  339
  340    def set_header(self) -> None:
  341        """
  342        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
  343        """
  344
  345        input_file = self.get_input()
  346        default_header_list = [
  347            "##fileformat=VCFv4.2",
  348            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
  349        ]
  350
  351        # Full path
  352        input_file = full_path(input_file)
  353
  354        if input_file:
  355
  356            input_format = self.get_input_format()
  357            input_compressed = self.get_input_compressed()
  358            config = self.get_config()
  359            header_list = default_header_list
  360            if input_format in [
  361                "vcf",
  362                "hdr",
  363                "tsv",
  364                "csv",
  365                "psv",
  366                "parquet",
  367                "db",
  368                "duckdb",
  369            ]:
  370                # header provided in param
  371                if config.get("header_file", None):
  372                    with open(config.get("header_file"), "rt") as f:
  373                        header_list = self.read_vcf_header(f)
  374                # within a vcf file format (header within input file itsself)
  375                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
  376                    # within a compressed vcf file format (.vcf.gz)
  377                    if input_compressed:
  378                        with bgzf.open(input_file, "rt") as f:
  379                            header_list = self.read_vcf_header(f)
  380                    # within an uncompressed vcf file format (.vcf)
  381                    else:
  382                        with open(input_file, "rt") as f:
  383                            header_list = self.read_vcf_header(f)
  384                # header provided in default external file .hdr
  385                elif os.path.exists((input_file + ".hdr")):
  386                    with open(input_file + ".hdr", "rt") as f:
  387                        header_list = self.read_vcf_header(f)
  388                else:
  389                    try:  # Try to get header info fields and file columns
  390
  391                        with tempfile.TemporaryDirectory() as tmpdir:
  392
  393                            # Create database
  394                            db_for_header = Database(database=input_file)
  395
  396                            # Get header columns for infos fields
  397                            db_header_from_columns = (
  398                                db_for_header.get_header_from_columns()
  399                            )
  400
  401                            # Get real columns in the file
  402                            db_header_columns = db_for_header.get_columns()
  403
  404                            # Write header file
  405                            header_file_tmp = os.path.join(tmpdir, "header")
  406                            f = open(header_file_tmp, "w")
  407                            vcf.Writer(f, db_header_from_columns)
  408                            f.close()
  409
  410                            # Replace #CHROM line with rel columns
  411                            header_list = db_for_header.read_header_file(
  412                                header_file=header_file_tmp
  413                            )
  414                            header_list[-1] = "\t".join(db_header_columns)
  415
  416                    except:
  417
  418                        log.warning(
  419                            f"No header for file {input_file}. Set as default VCF header"
  420                        )
  421                        header_list = default_header_list
  422
  423            else:  # try for unknown format ?
  424
  425                log.error(f"Input file format '{input_format}' not available")
  426                raise ValueError(f"Input file format '{input_format}' not available")
  427
  428            if not header_list:
  429                header_list = default_header_list
  430
  431            # header as list
  432            self.header_list = header_list
  433
  434            # header as VCF object
  435            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
  436
  437        else:
  438
  439            self.header_list = None
  440            self.header_vcf = None
  441
  442    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  443        """
  444        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  445        DataFrame based on the connection format.
  446
  447        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  448        represents the SQL query you want to execute. This query will be used to fetch data from a
  449        database and convert it into a pandas DataFrame
  450        :type query: str
  451        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  452        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  453        function will only fetch up to that number of rows from the database query result. If no limit
  454        is specified,
  455        :type limit: int
  456        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  457        """
  458
  459        # Connexion format
  460        connexion_format = self.get_connexion_format()
  461
  462        # Limit in query
  463        if limit:
  464            pd.set_option("display.max_rows", limit)
  465            if connexion_format in ["duckdb"]:
  466                df = (
  467                    self.conn.execute(query)
  468                    .fetch_record_batch(limit)
  469                    .read_next_batch()
  470                    .to_pandas()
  471                )
  472            elif connexion_format in ["sqlite"]:
  473                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  474
  475        # Full query
  476        else:
  477            if connexion_format in ["duckdb"]:
  478                df = self.conn.execute(query).df()
  479            elif connexion_format in ["sqlite"]:
  480                df = pd.read_sql_query(query, self.conn)
  481
  482        return df
  483
  484    def get_overview(self) -> None:
  485        """
  486        The function prints the input, output, config, and dataframe of the current object
  487        """
  488        table_variants_from = self.get_table_variants(clause="from")
  489        sql_columns = self.get_header_columns_as_sql()
  490        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  491        df = self.get_query_to_df(sql_query_export)
  492        log.info(
  493            "Input:  "
  494            + str(self.get_input())
  495            + " ["
  496            + str(str(self.get_input_format()))
  497            + "]"
  498        )
  499        log.info(
  500            "Output: "
  501            + str(self.get_output())
  502            + " ["
  503            + str(str(self.get_output_format()))
  504            + "]"
  505        )
  506        log.info("Config: ")
  507        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  508            "\n"
  509        ):
  510            log.info("\t" + str(d))
  511        log.info("Param: ")
  512        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  513            "\n"
  514        ):
  515            log.info("\t" + str(d))
  516        log.info("Sample list: " + str(self.get_header_sample_list()))
  517        log.info("Dataframe: ")
  518        for d in str(df).split("\n"):
  519            log.info("\t" + str(d))
  520
  521        # garbage collector
  522        del df
  523        gc.collect()
  524
  525        return None
  526
    def get_stats(self) -> dict:
        """
        Calculate various statistics of the current object and return them.

        The returned dictionary contains the following sections:
        - "Infos": input file, number of variants, number of samples (when
          computed), number of INFO/FORMAT fields
        - "Variants": counts by chromosome, counts by type (Total/SNV/MNV/
          InDel) and SNV substitutions
        - "Samples": genotype counts per sample (only when a "GT" FORMAT
          field and a FORMAT column are present)
        - "Header": description of INFO and FORMAT header fields
        - "Quality": QUAL statistics (only when a QUAL column is present)

        :return: a dictionary containing various statistics
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        # NOTE(review): called without a clause, so the "select" form is
        # returned even though the result feeds FROM clauses below — confirm
        # this is intended for read-only parquet inputs
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO and FORMAT field definitions
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chromosome
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage (fraction of total variants per chromosome)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: only meaningful when a GT format field and a FORMAT
        # column are available
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes per sample; rows are kept only when the
                # sample column has as many ':'-separated fields as FORMAT
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        # #
        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
        #     stats["Infos"]["Number of samples"] = nb_of_samples
        # elif nb_of_samples:
        #     stats["Infos"]["Number of samples"] = "not a VCF format"

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # `i` is a running index shared across both field categories
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: map special VCF "Number" codes to their letters
                # (A = per-ALT, G = per-genotype, R = per-allele, '.' = unknown)
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL statistics, skipping missing values ('.')
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        # NOTE(review): in the 'InDel' WHERE clause below, AND binds tighter
        # than OR, so the condition is `REF>1 OR (ALT>1 AND REF!=ALT)`; MNVs
        # with len(REF) > 1 are therefore also counted as InDel — confirm
        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
  748
  749    def stats_to_file(self, file: str = None) -> str:
  750        """
  751        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  752        into a JSON object, and writes the JSON object to the specified file.
  753
  754        :param file: The `file` parameter is a string that represents the file path where the JSON data
  755        will be written
  756        :type file: str
  757        :return: the name of the file that was written to.
  758        """
  759
  760        # Get stats
  761        stats = self.get_stats()
  762
  763        # Serializing json
  764        json_object = json.dumps(stats, indent=4)
  765
  766        # Writing to sample.json
  767        with open(file, "w") as outfile:
  768            outfile.write(json_object)
  769
  770        return file
  771
  772    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  773        """
  774        The `print_stats` function generates a markdown file and prints the statistics contained in a
  775        JSON file in a formatted manner.
  776
  777        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  778        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  779        provided, a temporary directory will be created and the stats will be saved in a file named
  780        "stats.md" within that
  781        :type output_file: str
  782        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  783        file where the statistics will be saved. If no value is provided, a temporary directory will be
  784        created and a default file name "stats.json" will be used
  785        :type json_file: str
  786        :return: The function `print_stats` does not return any value. It has a return type annotation
  787        of `None`.
  788        """
  789
  790        # Full path
  791        output_file = full_path(output_file)
  792        json_file = full_path(json_file)
  793
  794        with tempfile.TemporaryDirectory() as tmpdir:
  795
  796            # Files
  797            if not output_file:
  798                output_file = os.path.join(tmpdir, "stats.md")
  799            if not json_file:
  800                json_file = os.path.join(tmpdir, "stats.json")
  801
  802            # Create folders
  803            if not os.path.exists(os.path.dirname(output_file)):
  804                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  805            if not os.path.exists(os.path.dirname(json_file)):
  806                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  807
  808            # Create stats JSON file
  809            stats_file = self.stats_to_file(file=json_file)
  810
  811            # Print stats file
  812            with open(stats_file) as f:
  813                stats = yaml.safe_load(f)
  814
  815            # Output
  816            output_title = []
  817            output_index = []
  818            output = []
  819
  820            # Title
  821            output_title.append("# HOWARD Stats")
  822
  823            # Index
  824            output_index.append("## Index")
  825
  826            # Process sections
  827            for section in stats:
  828                infos = stats.get(section)
  829                section_link = "#" + section.lower().replace(" ", "-")
  830                output.append(f"## {section}")
  831                output_index.append(f"- [{section}]({section_link})")
  832
  833                if len(infos):
  834                    for info in infos:
  835                        try:
  836                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  837                            is_df = True
  838                        except:
  839                            try:
  840                                df = pd.DataFrame.from_dict(
  841                                    json.loads((infos.get(info))), orient="index"
  842                                )
  843                                is_df = True
  844                            except:
  845                                is_df = False
  846                        if is_df:
  847                            output.append(f"### {info}")
  848                            info_link = "#" + info.lower().replace(" ", "-")
  849                            output_index.append(f"   - [{info}]({info_link})")
  850                            output.append(f"{df.to_markdown(index=False)}")
  851                        else:
  852                            output.append(f"- {info}: {infos.get(info)}")
  853                else:
  854                    output.append(f"NA")
  855
  856            # Write stats in markdown file
  857            with open(output_file, "w") as fp:
  858                for item in output_title:
  859                    fp.write("%s\n" % item)
  860                for item in output_index:
  861                    fp.write("%s\n" % item)
  862                for item in output:
  863                    fp.write("%s\n" % item)
  864
  865            # Output stats in markdown
  866            print("")
  867            print("\n\n".join(output_title))
  868            print("")
  869            print("\n\n".join(output))
  870            print("")
  871
  872        return None
  873
  874    def get_input(self) -> str:
  875        """
  876        It returns the value of the input variable.
  877        :return: The input is being returned.
  878        """
  879        return self.input
  880
  881    def get_input_format(self, input_file: str = None) -> str:
  882        """
  883        This function returns the format of the input variable, either from the provided input file or
  884        by prompting for input.
  885
  886        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  887        represents the file path of the input file. If no `input_file` is provided when calling the
  888        method, it will default to `None`
  889        :type input_file: str
  890        :return: The format of the input variable is being returned.
  891        """
  892
  893        if not input_file:
  894            input_file = self.get_input()
  895        input_format = get_file_format(input_file)
  896        return input_format
  897
  898    def get_input_compressed(self, input_file: str = None) -> str:
  899        """
  900        The function `get_input_compressed` returns the format of the input variable after compressing
  901        it.
  902
  903        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  904        that represents the file path of the input file. If no `input_file` is provided when calling the
  905        method, it will default to `None` and the method will then call `self.get_input()` to
  906        :type input_file: str
  907        :return: The function `get_input_compressed` returns the compressed format of the input
  908        variable.
  909        """
  910
  911        if not input_file:
  912            input_file = self.get_input()
  913        input_compressed = get_file_compressed(input_file)
  914        return input_compressed
  915
  916    def get_output(self) -> str:
  917        """
  918        It returns the output of the neuron.
  919        :return: The output of the neural network.
  920        """
  921
  922        return self.output
  923
  924    def get_output_format(self, output_file: str = None) -> str:
  925        """
  926        The function `get_output_format` returns the format of the input variable or the output file if
  927        provided.
  928
  929        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  930        that represents the file path of the output file. If no `output_file` is provided when calling
  931        the method, it will default to the output obtained from the `get_output` method of the class
  932        instance. The
  933        :type output_file: str
  934        :return: The format of the input variable is being returned.
  935        """
  936
  937        if not output_file:
  938            output_file = self.get_output()
  939        output_format = get_file_format(output_file)
  940
  941        return output_format
  942
  943    def get_config(self) -> dict:
  944        """
  945        It returns the config
  946        :return: The config variable is being returned.
  947        """
  948        return self.config
  949
  950    def get_param(self) -> dict:
  951        """
  952        It returns the param
  953        :return: The param variable is being returned.
  954        """
  955        return self.param
  956
    def get_connexion_db(self) -> str:
        """
        Return the connexion database identifier.

        :return: The connexion_db attribute of this Variants instance.
        """
        return self.connexion_db
  963
    def get_prefix(self) -> str:
        """
        Return the prefix attribute.

        :return: The prefix string stored on this Variants instance.
        """
        return self.prefix
  970
  971    def get_table_variants(self, clause: str = "select") -> str:
  972        """
  973        This function returns the table_variants attribute of the object
  974
  975        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
  976        defaults to select (optional)
  977        :return: The table_variants attribute of the object.
  978        """
  979
  980        # Access
  981        access = self.get_config().get("access", None)
  982
  983        # Clauses "select", "where", "update"
  984        if clause in ["select", "where", "update"]:
  985            table_variants = self.table_variants
  986        # Clause "from"
  987        elif clause in ["from"]:
  988            # For Read Only
  989            if self.get_input_format() in ["parquet"] and access in ["RO"]:
  990                input_file = self.get_input()
  991                table_variants = f"'{input_file}' as variants"
  992            # For Read Write
  993            else:
  994                table_variants = f"{self.table_variants} as variants"
  995        else:
  996            table_variants = self.table_variants
  997        return table_variants
  998
  999    def get_tmp_dir(self) -> str:
 1000        """
 1001        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1002        parameters or a default path.
 1003        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1004        configuration, parameters, and a default value of "/tmp".
 1005        """
 1006
 1007        return get_tmp(
 1008            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1009        )
 1010
    def get_connexion_type(self) -> str:
        """
        Return the connexion type from the config ("memory" when not set).

        :return: The "connexion_type" config value, or "memory" by default.
        """
        return self.get_config().get("connexion_type", "memory")
 1018
    def get_connexion(self):
        """
        Return the database connection object.

        :return: The connection object stored on this Variants instance.
        """
        return self.conn
 1026
    def close_connexion(self) -> None:
        """
        Close the database connection.

        :return: The return value of the underlying connection's close().
        """
        return self.conn.close()
 1033
 1034    def get_header(self, type: str = "vcf"):
 1035        """
 1036        This function returns the header of the VCF file as a list of strings
 1037
 1038        :param type: the type of header you want to get, defaults to vcf (optional)
 1039        :return: The header of the vcf file.
 1040        """
 1041
 1042        if self.header_vcf:
 1043            if type == "vcf":
 1044                return self.header_vcf
 1045            elif type == "list":
 1046                return self.header_list
 1047        else:
 1048            if type == "vcf":
 1049                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1050                return header
 1051            elif type == "list":
 1052                return vcf_required
 1053
 1054    def get_header_length(self, file: str = None) -> int:
 1055        """
 1056        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1057        line.
 1058
 1059        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1060        header file. If this argument is provided, the function will read the header from the specified
 1061        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1062        :type file: str
 1063        :return: the length of the header list, excluding the #CHROM line.
 1064        """
 1065
 1066        if file:
 1067            return len(self.read_vcf_header_file(file=file)) - 1
 1068        elif self.get_header(type="list"):
 1069            return len(self.get_header(type="list")) - 1
 1070        else:
 1071            return 0
 1072
 1073    def get_header_columns(self) -> str:
 1074        """
 1075        This function returns the header list of a VCF
 1076
 1077        :return: The length of the header list.
 1078        """
 1079        if self.get_header():
 1080            return self.get_header(type="list")[-1]
 1081        else:
 1082            return ""
 1083
 1084    def get_header_columns_as_list(self) -> list:
 1085        """
 1086        This function returns the header list of a VCF
 1087
 1088        :return: The length of the header list.
 1089        """
 1090        if self.get_header():
 1091            return self.get_header_columns().strip().split("\t")
 1092        else:
 1093            return []
 1094
 1095    def get_header_columns_as_sql(self) -> str:
 1096        """
 1097        This function retruns header length (without #CHROM line)
 1098
 1099        :return: The length of the header list.
 1100        """
 1101        sql_column_list = []
 1102        for col in self.get_header_columns_as_list():
 1103            sql_column_list.append(f'"{col}"')
 1104        return ",".join(sql_column_list)
 1105
    def get_header_sample_list(self) -> list:
        """
        Return the list of sample names from the loaded VCF header.

        :return: The samples attribute of the vcf.Reader header object.
        """
        return self.header_vcf.samples
 1113
    def get_verbose(self) -> bool:
        """
        Return the verbose flag from the config.

        :return: The "verbose" config value, or False when not set.
        """
        return self.get_config().get("verbose", False)
 1122
 1123    def get_connexion_format(self) -> str:
 1124        """
 1125        It returns the connexion format of the object.
 1126        :return: The connexion_format is being returned.
 1127        """
 1128        connexion_format = self.connexion_format
 1129        if connexion_format not in ["duckdb", "sqlite"]:
 1130            log.error(f"Unknown connexion format {connexion_format}")
 1131            raise ValueError(f"Unknown connexion format {connexion_format}")
 1132        else:
 1133            return connexion_format
 1134
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table of the current connexion.

        :param file: Path or file-like object of the delimited file to load
        :param columns: Comma-separated, quoted column names used in the
        INSERT statement; must match the file's columns
        :type columns: str
        :param header_len: Number of leading lines to skip before the data
        (e.g. the VCF header length), defaults to 0
        :type header_len: int (optional)
        :param sep: Field delimiter of the input file, defaults to "\t"
        :type sep: str (optional)
        :param chunksize: Number of rows read per chunk; may be overridden by
        the "load.chunk" config entry, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config may override the chunk size
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # A falsy chunksize disables loading entirely
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # NOTE: DuckDB resolves "chunk" in the SQL below via its
                    # replacement scan of local Python variables, so the loop
                    # variable name "chunk" is significant here.
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # pandas issues the INSERTs for SQLite connexions
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1188
 1189    def load_data(
 1190        self,
 1191        input_file: str = None,
 1192        drop_variants_table: bool = False,
 1193        sample_size: int = 20480,
 1194    ) -> None:
 1195        """
 1196        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
 1197        table before loading the data and specify a sample size.
 1198
 1199        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
 1200        table
 1201        :type input_file: str
 1202        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
 1203        determines whether the variants table should be dropped before loading the data. If set to
 1204        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
 1205        not be dropped, defaults to False
 1206        :type drop_variants_table: bool (optional)
 1207        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
 1208        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
 1209        20480
 1210        :type sample_size: int (optional)
 1211        """
 1212
 1213        log.info("Loading...")
 1214
 1215        # change input file
 1216        if input_file:
 1217            self.set_input(input_file)
 1218            self.set_header()
 1219
 1220        # drop variants table
 1221        if drop_variants_table:
 1222            self.drop_variants_table()
 1223
 1224        # get table variants
 1225        table_variants = self.get_table_variants()
 1226
 1227        # Access
 1228        access = self.get_config().get("access", None)
 1229        log.debug(f"access: {access}")
 1230
 1231        # Input format and compress
 1232        input_format = self.get_input_format()
 1233        input_compressed = self.get_input_compressed()
 1234        log.debug(f"input_format: {input_format}")
 1235        log.debug(f"input_compressed: {input_compressed}")
 1236
 1237        # input_compressed_format
 1238        if input_compressed:
 1239            input_compressed_format = "gzip"
 1240        else:
 1241            input_compressed_format = "none"
 1242        log.debug(f"input_compressed_format: {input_compressed_format}")
 1243
 1244        # Connexion format
 1245        connexion_format = self.get_connexion_format()
 1246
 1247        # Sample size
 1248        if not sample_size:
 1249            sample_size = -1
 1250        log.debug(f"sample_size: {sample_size}")
 1251
 1252        # Load data
 1253        log.debug(f"Load Data from {input_format}")
 1254
 1255        # DuckDB connexion
 1256        if connexion_format in ["duckdb"]:
 1257
 1258            # Database already exists
 1259            if self.input_format in ["db", "duckdb"]:
 1260
 1261                if connexion_format in ["duckdb"]:
 1262                    log.debug(f"Input file format '{self.input_format}' duckDB")
 1263                else:
 1264                    log.error(
 1265                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1266                    )
 1267                    raise ValueError(
 1268                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1269                    )
 1270
 1271            # Load from existing database format
 1272            else:
 1273
 1274                try:
 1275                    # Create Table or View
 1276                    database = Database(database=self.input)
 1277                    sql_from = database.get_sql_from(sample_size=sample_size)
 1278
 1279                    if access in ["RO"]:
 1280                        sql_load = (
 1281                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
 1282                        )
 1283                    else:
 1284                        sql_load = (
 1285                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
 1286                        )
 1287                    self.conn.execute(sql_load)
 1288
 1289                except:
 1290                    # Format not available
 1291                    log.error(f"Input file format '{self.input_format}' not available")
 1292                    raise ValueError(
 1293                        f"Input file format '{self.input_format}' not available"
 1294                    )
 1295
 1296        # SQLite connexion
 1297        elif connexion_format in ["sqlite"] and input_format in [
 1298            "vcf",
 1299            "tsv",
 1300            "csv",
 1301            "psv",
 1302        ]:
 1303
 1304            # Main structure
 1305            structure = {
 1306                "#CHROM": "VARCHAR",
 1307                "POS": "INTEGER",
 1308                "ID": "VARCHAR",
 1309                "REF": "VARCHAR",
 1310                "ALT": "VARCHAR",
 1311                "QUAL": "VARCHAR",
 1312                "FILTER": "VARCHAR",
 1313                "INFO": "VARCHAR",
 1314            }
 1315
 1316            # Strcuture with samples
 1317            structure_complete = structure
 1318            if self.get_header_sample_list():
 1319                structure["FORMAT"] = "VARCHAR"
 1320                for sample in self.get_header_sample_list():
 1321                    structure_complete[sample] = "VARCHAR"
 1322
 1323            # Columns list for create and insert
 1324            sql_create_table_columns = []
 1325            sql_create_table_columns_list = []
 1326            for column in structure_complete:
 1327                column_type = structure_complete[column]
 1328                sql_create_table_columns.append(
 1329                    f'"{column}" {column_type} default NULL'
 1330                )
 1331                sql_create_table_columns_list.append(f'"{column}"')
 1332
 1333            # Create database
 1334            log.debug(f"Create Table {table_variants}")
 1335            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
 1336            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
 1337            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
 1338            self.conn.execute(sql_create_table)
 1339
 1340            # chunksize define length of file chunk load file
 1341            chunksize = 100000
 1342
 1343            # delimiter
 1344            delimiter = file_format_delimiters.get(input_format, "\t")
 1345
 1346            # Load the input file
 1347            with open(self.input, "rt") as input_file:
 1348
 1349                # Use the appropriate file handler based on the input format
 1350                if input_compressed:
 1351                    input_file = bgzf.open(self.input, "rt")
 1352                if input_format in ["vcf"]:
 1353                    header_len = self.get_header_length()
 1354                else:
 1355                    header_len = 0
 1356
 1357                # Insert the file contents into a table
 1358                self.insert_file_to_table(
 1359                    input_file,
 1360                    columns=sql_create_table_columns_list_sql,
 1361                    header_len=header_len,
 1362                    sep=delimiter,
 1363                    chunksize=chunksize,
 1364                )
 1365
 1366        else:
 1367            log.error(
 1368                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1369            )
 1370            raise ValueError(
 1371                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1372            )
 1373
 1374        # Explode INFOS fields into table fields
 1375        if self.get_explode_infos():
 1376            self.explode_infos(
 1377                prefix=self.get_explode_infos_prefix(),
 1378                fields=self.get_explode_infos_fields(),
 1379                force=True,
 1380            )
 1381
 1382        # Create index after insertion
 1383        self.create_indexes()
 1384
 1385    def get_explode_infos(self) -> bool:
 1386        """
 1387        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1388        to False if it is not set.
 1389        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1390        value. If the parameter is not present, it will return False.
 1391        """
 1392
 1393        return self.get_param().get("explode", {}).get("explode_infos", False)
 1394
 1395    def get_explode_infos_fields(
 1396        self,
 1397        explode_infos_fields: str = None,
 1398        remove_fields_not_in_header: bool = False,
 1399    ) -> list:
 1400        """
 1401        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1402        the input parameter `explode_infos_fields`.
 1403
 1404        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1405        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1406        comma-separated list of field names to explode
 1407        :type explode_infos_fields: str
 1408        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1409        flag that determines whether to remove fields that are not present in the header. If it is set
 1410        to `True`, any field that is not in the header will be excluded from the list of exploded
 1411        information fields. If it is set to `, defaults to False
 1412        :type remove_fields_not_in_header: bool (optional)
 1413        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1414        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1415        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1416        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1417        splitting the string by commas.
 1418        """
 1419
 1420        # If no fields, get it in param
 1421        if not explode_infos_fields:
 1422            explode_infos_fields = (
 1423                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1424            )
 1425
 1426        # If no fields, defined as all fields in header using keyword
 1427        if not explode_infos_fields:
 1428            explode_infos_fields = "*"
 1429
 1430        # If fields list not empty
 1431        if explode_infos_fields:
 1432
 1433            # Input fields list
 1434            if isinstance(explode_infos_fields, str):
 1435                fields_input = explode_infos_fields.split(",")
 1436            elif isinstance(explode_infos_fields, list):
 1437                fields_input = explode_infos_fields
 1438            else:
 1439                fields_input = []
 1440
 1441            # Fields list without * keyword
 1442            fields_without_all = fields_input.copy()
 1443            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1444                fields_without_all.remove("*")
 1445
 1446            # Fields in header
 1447            fields_in_header = sorted(list(set(self.get_header().infos)))
 1448
 1449            # Construct list of fields
 1450            fields_output = []
 1451            for field in fields_input:
 1452
 1453                # Strip field
 1454                field = field.strip()
 1455
 1456                # format keyword * in regex
 1457                if field.upper() in ["*"]:
 1458                    field = ".*"
 1459
 1460                # Find all fields with pattern
 1461                r = re.compile(field)
 1462                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1463
 1464                # Remove fields input from search
 1465                if field in fields_search:
 1466                    fields_search = [field]
 1467                elif fields_search != [field]:
 1468                    fields_search = sorted(
 1469                        list(set(fields_search).difference(fields_input))
 1470                    )
 1471
 1472                # If field is not in header (avoid not well formatted header)
 1473                if not fields_search and not remove_fields_not_in_header:
 1474                    fields_search = [field]
 1475
 1476                # Add found fields
 1477                for new_field in fields_search:
 1478                    # Add field, if not already exists, and if it is in header (if asked)
 1479                    if (
 1480                        new_field not in fields_output
 1481                        and (
 1482                            not remove_fields_not_in_header
 1483                            or new_field in fields_in_header
 1484                        )
 1485                        and new_field not in [".*"]
 1486                    ):
 1487                        fields_output.append(new_field)
 1488
 1489            return fields_output
 1490
 1491        else:
 1492
 1493            return []
 1494
 1495    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1496        """
 1497        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1498        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1499        not provided.
 1500
 1501        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1502        prefix to be used for exploding or expanding information
 1503        :type explode_infos_prefix: str
 1504        :return: the value of the variable `explode_infos_prefix`.
 1505        """
 1506
 1507        if not explode_infos_prefix:
 1508            explode_infos_prefix = (
 1509                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1510            )
 1511
 1512        return explode_infos_prefix
 1513
 1514    def add_column(
 1515        self,
 1516        table_name,
 1517        column_name,
 1518        column_type,
 1519        default_value=None,
 1520        drop: bool = False,
 1521    ) -> dict:
 1522        """
 1523        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1524        doesn't already exist.
 1525
 1526        :param table_name: The name of the table to which you want to add a column
 1527        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1528        to the table
 1529        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1530        want to add to the table. It should be a string that represents the desired data type, such as
 1531        "INTEGER", "TEXT", "REAL", etc
 1532        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1533        default value for the newly added column. If a default value is provided, it will be assigned to
 1534        the column for any existing rows that do not have a value for that column
 1535        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1536        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1537        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1538        to False
 1539        :type drop: bool (optional)
 1540        :return: a boolean value indicating whether the column was successfully added to the table.
 1541        """
 1542
 1543        # added
 1544        added = False
 1545        dropped = False
 1546
 1547        # Check if the column already exists in the table
 1548        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1549        columns = self.get_query_to_df(query).columns.tolist()
 1550        if column_name.upper() in [c.upper() for c in columns]:
 1551            log.debug(
 1552                f"The {column_name} column already exists in the {table_name} table"
 1553            )
 1554            if drop:
 1555                self.drop_column(table_name=table_name, column_name=column_name)
 1556                dropped = True
 1557            else:
 1558                return None
 1559        else:
 1560            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1561
 1562        # Add column in table
 1563        add_column_query = (
 1564            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1565        )
 1566        if default_value is not None:
 1567            add_column_query += f" DEFAULT {default_value}"
 1568        self.execute_query(add_column_query)
 1569        added = not dropped
 1570        log.debug(
 1571            f"The {column_name} column was successfully added to the {table_name} table"
 1572        )
 1573
 1574        if added:
 1575            added_column = {
 1576                "table_name": table_name,
 1577                "column_name": column_name,
 1578                "column_type": column_type,
 1579                "default_value": default_value,
 1580            }
 1581        else:
 1582            added_column = None
 1583
 1584        return added_column
 1585
 1586    def drop_column(
 1587        self, column: dict = None, table_name: str = None, column_name: str = None
 1588    ) -> bool:
 1589        """
 1590        The `drop_column` function drops a specified column from a given table in a database and returns
 1591        True if the column was successfully dropped, and False if the column does not exist in the
 1592        table.
 1593
 1594        :param column: The `column` parameter is a dictionary that contains information about the column
 1595        you want to drop. It has two keys:
 1596        :type column: dict
 1597        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1598        drop a column
 1599        :type table_name: str
 1600        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1601        from the table
 1602        :type column_name: str
 1603        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1604        and False if the column does not exist in the table.
 1605        """
 1606
 1607        # Find column infos
 1608        if column:
 1609            if isinstance(column, dict):
 1610                table_name = column.get("table_name", None)
 1611                column_name = column.get("column_name", None)
 1612            elif isinstance(column, str):
 1613                table_name = self.get_table_variants()
 1614                column_name = column
 1615            else:
 1616                table_name = None
 1617                column_name = None
 1618
 1619        if not table_name and not column_name:
 1620            return False
 1621
 1622        # Removed
 1623        removed = False
 1624
 1625        # Check if the column already exists in the table
 1626        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1627        columns = self.get_query_to_df(query).columns.tolist()
 1628        if column_name in columns:
 1629            log.debug(f"The {column_name} column exists in the {table_name} table")
 1630        else:
 1631            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1632            return False
 1633
 1634        # Add column in table # ALTER TABLE integers DROP k
 1635        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1636        self.execute_query(add_column_query)
 1637        removed = True
 1638        log.debug(
 1639            f"The {column_name} column was successfully dropped to the {table_name} table"
 1640        )
 1641
 1642        return removed
 1643
 1644    def explode_infos(
 1645        self,
 1646        prefix: str = None,
 1647        create_index: bool = False,
 1648        fields: list = None,
 1649        force: bool = False,
 1650        proccess_all_fields_together: bool = False,
 1651        table: str = None,
 1652    ) -> list:
 1653        """
 1654        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
 1655        individual columns, returning a list of added columns.
 1656
 1657        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
 1658        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
 1659        `self.get_explode_infos_prefix()` as the prefix
 1660        :type prefix: str
 1661        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
 1662        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
 1663        `False`, indexes will not be created. The default value is `False`, defaults to False
 1664        :type create_index: bool (optional)
 1665        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
 1666        that you want to explode into individual columns. If this parameter is not provided, all INFO
 1667        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
 1668        a list to the `
 1669        :type fields: list
 1670        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
 1671        determines whether to drop and recreate a column if it already exists in the table. If `force`
 1672        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
 1673        defaults to False
 1674        :type force: bool (optional)
 1675        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
 1676        flag that determines whether to process all the INFO fields together or individually. If set to
 1677        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
 1678        be processed individually. The default value is, defaults to False
 1679        :type proccess_all_fields_together: bool (optional)
 1680        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
 1681        of the table where the exploded INFO fields will be added as individual columns. If you provide
 1682        a value for the `table` parameter, the function will use that table name. If the `table`
 1683        parameter is
 1684        :type table: str
 1685        :return: The `explode_infos` function returns a list of added columns.
 1686        """
 1687
 1688        # drop indexes
 1689        self.drop_indexes()
 1690
 1691        # connexion format
 1692        connexion_format = self.get_connexion_format()
 1693
 1694        # Access
 1695        access = self.get_config().get("access", None)
 1696
 1697        # Added columns
 1698        added_columns = []
 1699
 1700        if access not in ["RO"]:
 1701
 1702            # prefix
 1703            if prefix in [None, True] or not isinstance(prefix, str):
 1704                if self.get_explode_infos_prefix() not in [None, True]:
 1705                    prefix = self.get_explode_infos_prefix()
 1706                else:
 1707                    prefix = "INFO/"
 1708
 1709            # table variants
 1710            if table is not None:
 1711                table_variants = table
 1712            else:
 1713                table_variants = self.get_table_variants(clause="select")
 1714
 1715            # extra infos
 1716            try:
 1717                extra_infos = self.get_extra_infos()
 1718            except:
 1719                extra_infos = []
 1720
 1721            # Header infos
 1722            header_infos = self.get_header().infos
 1723
 1724            log.debug(
 1725                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
 1726            )
 1727
 1728            sql_info_alter_table_array = []
 1729
 1730            # Info fields to check
 1731            fields_list = list(header_infos)
 1732            if fields:
 1733                fields_list += fields
 1734            fields_list = set(fields_list)
 1735
 1736            # If no fields
 1737            if not fields:
 1738                fields = []
 1739
 1740            # Translate fields if patterns
 1741            fields = self.get_explode_infos_fields(explode_infos_fields=fields)
 1742
 1743            for info in fields:
 1744
 1745                info_id_sql = prefix + info
 1746
 1747                if (
 1748                    info in fields_list
 1749                    or prefix + info in fields_list
 1750                    or info in extra_infos
 1751                ):
 1752
 1753                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")
 1754
 1755                    if info in header_infos:
 1756                        info_type = header_infos[info].type
 1757                        info_num = header_infos[info].num
 1758                    else:
 1759                        info_type = "String"
 1760                        info_num = 0
 1761
 1762                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
 1763                    if info_num != 1:
 1764                        type_sql = "VARCHAR"
 1765
 1766                    # Add field
 1767                    added_column = self.add_column(
 1768                        table_name=table_variants,
 1769                        column_name=info_id_sql,
 1770                        column_type=type_sql,
 1771                        default_value="null",
 1772                        drop=force,
 1773                    )
 1774
 1775                    if added_column:
 1776                        added_columns.append(added_column)
 1777
 1778                    if added_column or force:
 1779
 1780                        # add field to index
 1781                        self.index_additionnal_fields.append(info_id_sql)
 1782
 1783                        # Update field array
 1784                        if connexion_format in ["duckdb"]:
 1785                            update_info_field = f"""
 1786                            "{info_id_sql}" =
 1787                                CASE
 1788                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
 1789                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
 1790                                END
 1791                            """
 1792                        elif connexion_format in ["sqlite"]:
 1793                            update_info_field = f"""
 1794                                "{info_id_sql}" =
 1795                                    CASE
 1796                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
 1797                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
 1798                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
 1799                                    END
 1800                            """
 1801
 1802                        sql_info_alter_table_array.append(update_info_field)
 1803
 1804            if sql_info_alter_table_array:
 1805
 1806                # By chromosomes
 1807                try:
 1808                    chromosomes_list = list(
 1809                        self.get_query_to_df(
 1810                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
 1811                        )["#CHROM"]
 1812                    )
 1813                except:
 1814                    chromosomes_list = [None]
 1815
 1816                for chrom in chromosomes_list:
 1817                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")
 1818
 1819                    # Where clause
 1820                    where_clause = ""
 1821                    if chrom and len(chromosomes_list) > 1:
 1822                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """
 1823
 1824                    # Update table
 1825                    if proccess_all_fields_together:
 1826                        sql_info_alter_table_array_join = ", ".join(
 1827                            sql_info_alter_table_array
 1828                        )
 1829                        if sql_info_alter_table_array_join:
 1830                            sql_info_alter_table = f"""
 1831                                UPDATE {table_variants}
 1832                                SET {sql_info_alter_table_array_join}
 1833                                {where_clause}
 1834                                """
 1835                            log.debug(
 1836                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
 1837                            )
 1838                            # log.debug(sql_info_alter_table)
 1839                            self.conn.execute(sql_info_alter_table)
 1840                    else:
 1841                        sql_info_alter_num = 0
 1842                        for sql_info_alter in sql_info_alter_table_array:
 1843                            sql_info_alter_num += 1
 1844                            sql_info_alter_table = f"""
 1845                                UPDATE {table_variants}
 1846                                SET {sql_info_alter}
 1847                                {where_clause}
 1848                                """
 1849                            log.debug(
 1850                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
 1851                            )
 1852                            # log.debug(sql_info_alter_table)
 1853                            self.conn.execute(sql_info_alter_table)
 1854
 1855        # create indexes
 1856        if create_index:
 1857            self.create_indexes()
 1858
 1859        return added_columns
 1860
 1861    def create_indexes(self) -> None:
 1862        """
 1863        Create indexes on the table after insertion
 1864        """
 1865
 1866        # Access
 1867        access = self.get_config().get("access", None)
 1868
 1869        # get table variants
 1870        table_variants = self.get_table_variants("FROM")
 1871
 1872        if self.get_indexing() and access not in ["RO"]:
 1873            # Create index
 1874            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 1875            self.conn.execute(sql_create_table_index)
 1876            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 1877            self.conn.execute(sql_create_table_index)
 1878            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 1879            self.conn.execute(sql_create_table_index)
 1880            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 1881            self.conn.execute(sql_create_table_index)
 1882            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 1883            self.conn.execute(sql_create_table_index)
 1884            for field in self.index_additionnal_fields:
 1885                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 1886                self.conn.execute(sql_create_table_index)
 1887
 1888    def drop_indexes(self) -> None:
 1889        """
 1890        Create indexes on the table after insertion
 1891        """
 1892
 1893        # Access
 1894        access = self.get_config().get("access", None)
 1895
 1896        # get table variants
 1897        table_variants = self.get_table_variants("FROM")
 1898
 1899        # Get database format
 1900        connexion_format = self.get_connexion_format()
 1901
 1902        if access not in ["RO"]:
 1903            if connexion_format in ["duckdb"]:
 1904                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 1905            elif connexion_format in ["sqlite"]:
 1906                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 1907
 1908            list_indexes = self.conn.execute(sql_list_indexes)
 1909            index_names = [row[0] for row in list_indexes.fetchall()]
 1910            for index in index_names:
 1911                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 1912                self.conn.execute(sql_drop_table_index)
 1913
 1914    def read_vcf_header(self, f) -> list:
 1915        """
 1916        It reads the header of a VCF file and returns a list of the header lines
 1917
 1918        :param f: the file object
 1919        :return: The header lines of the VCF file.
 1920        """
 1921
 1922        header_list = []
 1923        for line in f:
 1924            header_list.append(line)
 1925            if line.startswith("#CHROM"):
 1926                break
 1927        return header_list
 1928
 1929    def read_vcf_header_file(self, file: str = None) -> list:
 1930        """
 1931        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 1932        uncompressed files.
 1933
 1934        :param file: The `file` parameter is a string that represents the path to the VCF header file
 1935        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 1936        default to `None`
 1937        :type file: str
 1938        :return: The function `read_vcf_header_file` returns a list.
 1939        """
 1940
 1941        if self.get_input_compressed(input_file=file):
 1942            with bgzf.open(file, "rt") as f:
 1943                return self.read_vcf_header(f=f)
 1944        else:
 1945            with open(file, "rt") as f:
 1946                return self.read_vcf_header(f=f)
 1947
 1948    def execute_query(self, query: str):
 1949        """
 1950        It takes a query as an argument, executes it, and returns the results
 1951
 1952        :param query: The query to be executed
 1953        :return: The result of the query is being returned.
 1954        """
 1955        if query:
 1956            return self.conn.execute(query)  # .fetchall()
 1957        else:
 1958            return None
 1959
 1960    def export_output(
 1961        self,
 1962        output_file: str | None = None,
 1963        output_header: str | None = None,
 1964        export_header: bool = True,
 1965        query: str | None = None,
 1966        parquet_partitions: list | None = None,
 1967        chunk_size: int | None = None,
 1968        threads: int | None = None,
 1969        sort: bool = False,
 1970        index: bool = False,
 1971        order_by: str | None = None,
 1972    ) -> bool:
 1973        """
 1974        The `export_output` function exports data from a VCF file to a specified output file in various
 1975        formats, including VCF, CSV, TSV, PSV, and Parquet.
 1976
 1977        :param output_file: The `output_file` parameter is a string that specifies the name of the
 1978        output file to be generated by the function. This is where the exported data will be saved
 1979        :type output_file: str
 1980        :param output_header: The `output_header` parameter is a string that specifies the name of the
 1981        file where the header of the VCF file will be exported. If this parameter is not provided, the
 1982        header will be exported to a file with the same name as the `output_file` parameter, but with
 1983        the extension "
 1984        :type output_header: str
 1985        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 1986        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 1987        True, the header will be exported to a file. If `export_header` is False, the header will not
 1988        be, defaults to True, if output format is not VCF
 1989        :type export_header: bool (optional)
 1990        :param query: The `query` parameter is an optional SQL query that can be used to filter and
 1991        select specific data from the VCF file before exporting it. If provided, only the data that
 1992        matches the query will be exported
 1993        :type query: str
 1994        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 1995        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 1996        organize data in a hierarchical directory structure based on the values of one or more columns.
 1997        This can improve query performance when working with large datasets
 1998        :type parquet_partitions: list
 1999        :param chunk_size: The `chunk_size` parameter specifies the number of
 2000        records in batch when exporting data in Parquet format. This parameter is used for
 2001        partitioning the Parquet file into multiple files.
 2002        :type chunk_size: int
 2003        :param threads: The `threads` parameter is an optional parameter that specifies the number of
 2004        threads to be used during the export process. It determines the level of parallelism and can
 2005        improve the performance of the export operation. If not provided, the function will use the
 2006        default number of threads
 2007        :type threads: int
 2008        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
 2009        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
 2010        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
 2011        False
 2012        :type sort: bool (optional)
 2013        :param index: The `index` parameter is a boolean flag that determines whether an index should be
 2014        created on the output file. If `index` is True, an index will be created. If `index` is False,
 2015        no index will be created. The default value is False, defaults to False
 2016        :type index: bool (optional)
 2017        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
 2018        sorting the output file. This parameter is only applicable when exporting data in VCF format
 2019        :type order_by: str
 2020        :return: a boolean value. It checks if the output file exists and returns True if it does, or
 2021        None if it doesn't.
 2022        """
 2023
 2024        # Log
 2025        log.info("Exporting...")
 2026
 2027        # Full path
 2028        output_file = full_path(output_file)
 2029        output_header = full_path(output_header)
 2030
 2031        # Config
 2032        config = self.get_config()
 2033
 2034        # Param
 2035        param = self.get_param()
 2036
 2037        # Tmp files to remove
 2038        tmp_to_remove = []
 2039
 2040        # If no output, get it
 2041        if not output_file:
 2042            output_file = self.get_output()
 2043
 2044        # If not threads
 2045        if not threads:
 2046            threads = self.get_threads()
 2047
 2048        # Auto header name with extension
 2049        if export_header or output_header:
 2050            if not output_header:
 2051                output_header = f"{output_file}.hdr"
 2052            # Export header
 2053            self.export_header(output_file=output_file)
 2054
 2055        # Switch off export header if VCF output
 2056        output_file_type = get_file_format(output_file)
 2057        if output_file_type in ["vcf"]:
 2058            export_header = False
 2059            tmp_to_remove.append(output_header)
 2060
 2061        # Chunk size
 2062        if not chunk_size:
 2063            chunk_size = config.get("chunk_size", None)
 2064
 2065        # Parquet partition
 2066        if not parquet_partitions:
 2067            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2068        if parquet_partitions and isinstance(parquet_partitions, str):
 2069            parquet_partitions = parquet_partitions.split(",")
 2070
 2071        # Order by
 2072        if not order_by:
 2073            order_by = param.get("export", {}).get("order_by", "")
 2074
 2075        # Header in output
 2076        header_in_output = param.get("export", {}).get("include_header", False)
 2077
 2078        # Database
 2079        database_source = self.get_connexion()
 2080
 2081        # Connexion format
 2082        connexion_format = self.get_connexion_format()
 2083
 2084        # Explode infos
 2085        if self.get_explode_infos():
 2086            self.explode_infos(
 2087                prefix=self.get_explode_infos_prefix(),
 2088                fields=self.get_explode_infos_fields(),
 2089                force=False,
 2090            )
 2091
 2092        # if connexion_format in ["sqlite"] or query:
 2093        if connexion_format in ["sqlite"]:
 2094
 2095            # Export in Parquet
 2096            random_tmp = "".join(
 2097                random.choice(string.ascii_lowercase) for i in range(10)
 2098            )
 2099            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2100            tmp_to_remove.append(database_source)
 2101
 2102            # Table Variants
 2103            table_variants = self.get_table_variants()
 2104
 2105            # Create export query
 2106            sql_query_export_subquery = f"""
 2107                SELECT * FROM {table_variants}
 2108                """
 2109
 2110            # Write source file
 2111            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2112
 2113        # Create database
 2114        database = Database(
 2115            database=database_source,
 2116            table="variants",
 2117            header_file=output_header,
 2118            conn_config=self.get_connexion_config(),
 2119        )
 2120
 2121        # Existing colomns header
 2122        # existing_columns_header = database.get_header_file_columns(output_header)
 2123        existing_columns_header = database.get_header_columns_from_database()
 2124
 2125        # Export file
 2126        database.export(
 2127            output_database=output_file,
 2128            output_header=output_header,
 2129            existing_columns_header=existing_columns_header,
 2130            parquet_partitions=parquet_partitions,
 2131            chunk_size=chunk_size,
 2132            threads=threads,
 2133            sort=sort,
 2134            index=index,
 2135            header_in_output=header_in_output,
 2136            order_by=order_by,
 2137            query=query,
 2138            export_header=export_header,
 2139        )
 2140
 2141        # Remove
 2142        remove_if_exists(tmp_to_remove)
 2143
 2144        return (os.path.exists(output_file) or None) and (
 2145            os.path.exists(output_file) or None
 2146        )
 2147
 2148    def get_extra_infos(self, table: str = None) -> list:
 2149        """
 2150        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2151        in the header.
 2152
 2153        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2154        name of the table from which you want to retrieve the extra columns that are not present in the
 2155        header. If the `table` parameter is not provided when calling the function, it will default to
 2156        using the variants
 2157        :type table: str
 2158        :return: A list of columns that are in the specified table but not in the header of the table.
 2159        """
 2160
 2161        header_columns = []
 2162
 2163        if not table:
 2164            table = self.get_table_variants(clause="from")
 2165            header_columns = self.get_header_columns()
 2166
 2167        # Check all columns in the database
 2168        query = f""" SELECT * FROM {table} LIMIT 1 """
 2169        log.debug(f"query {query}")
 2170        table_columns = self.get_query_to_df(query).columns.tolist()
 2171        extra_columns = []
 2172
 2173        # Construct extra infos (not in header)
 2174        for column in table_columns:
 2175            if column not in header_columns:
 2176                extra_columns.append(column)
 2177
 2178        return extra_columns
 2179
 2180    def get_extra_infos_sql(self, table: str = None) -> str:
 2181        """
 2182        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2183        by double quotes
 2184
 2185        :param table: The name of the table to get the extra infos from. If None, the default table is
 2186        used
 2187        :type table: str
 2188        :return: A string of the extra infos
 2189        """
 2190
 2191        return ", ".join(
 2192            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2193        )
 2194
 2195    def export_header(
 2196        self,
 2197        header_name: str = None,
 2198        output_file: str = None,
 2199        output_file_ext: str = ".hdr",
 2200        clean_header: bool = True,
 2201        remove_chrom_line: bool = False,
 2202    ) -> str:
 2203        """
 2204        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2205        specified options, and writes it to a new file.
 2206
 2207        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2208        this parameter is not specified, the header will be written to the output file
 2209        :type header_name: str
 2210        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2211        specify the name of the output file where the header will be written. If this parameter is not
 2212        provided, the header will be written to a temporary file
 2213        :type output_file: str
 2214        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2215        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2216        if not specified by the user. This extension will be appended to the `output_file` name to
 2217        create the final, defaults to .hdr
 2218        :type output_file_ext: str (optional)
 2219        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2220        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2221        `True`, the function will clean the header by modifying certain lines based on a specific
 2222        pattern. If `clean_header`, defaults to True
 2223        :type clean_header: bool (optional)
 2224        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2225        boolean flag that determines whether the #CHROM line should be removed from the header before
 2226        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2227        defaults to False
 2228        :type remove_chrom_line: bool (optional)
 2229        :return: The function `export_header` returns the name of the temporary header file that is
 2230        created.
 2231        """
 2232
 2233        if not header_name and not output_file:
 2234            output_file = self.get_output()
 2235
 2236        if self.get_header():
 2237
 2238            # Get header object
 2239            header_obj = self.get_header()
 2240
 2241            # Create database
 2242            db_for_header = Database(database=self.get_input())
 2243
 2244            # Get real columns in the file
 2245            db_header_columns = db_for_header.get_columns()
 2246
 2247            with tempfile.TemporaryDirectory() as tmpdir:
 2248
 2249                # Write header file
 2250                header_file_tmp = os.path.join(tmpdir, "header")
 2251                f = open(header_file_tmp, "w")
 2252                vcf.Writer(f, header_obj)
 2253                f.close()
 2254
 2255                # Replace #CHROM line with rel columns
 2256                header_list = db_for_header.read_header_file(
 2257                    header_file=header_file_tmp
 2258                )
 2259                header_list[-1] = "\t".join(db_header_columns)
 2260
 2261                # Remove CHROM line
 2262                if remove_chrom_line:
 2263                    header_list.pop()
 2264
 2265                # Clean header
 2266                if clean_header:
 2267                    header_list_clean = []
 2268                    for head in header_list:
 2269                        # Clean head for malformed header
 2270                        head_clean = head
 2271                        head_clean = re.subn(
 2272                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2273                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2274                            head_clean,
 2275                            2,
 2276                        )[0]
 2277                        # Write header
 2278                        header_list_clean.append(head_clean)
 2279                    header_list = header_list_clean
 2280
 2281            tmp_header_name = output_file + output_file_ext
 2282
 2283            f = open(tmp_header_name, "w")
 2284            for line in header_list:
 2285                f.write(line)
 2286            f.close()
 2287
 2288        return tmp_header_name
 2289
 2290    def export_variant_vcf(
 2291        self,
 2292        vcf_file,
 2293        remove_info: bool = False,
 2294        add_samples: bool = True,
 2295        list_samples: list = [],
 2296        where_clause: str = "",
 2297        index: bool = False,
 2298        threads: int | None = None,
 2299    ) -> bool | None:
 2300        """
 2301        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2302        remove INFO field, add samples, and control compression and indexing.
 2303
 2304        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2305        written to. It is the output file that will contain the filtered VCF data based on the specified
 2306        parameters
 2307        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2308        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2309        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2310        in, defaults to False
 2311        :type remove_info: bool (optional)
 2312        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2313        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2314        If set to False, the samples will be removed. The default value is True, defaults to True
 2315        :type add_samples: bool (optional)
 2316        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2317        in the output VCF file. By default, all samples will be included. If you provide a list of
 2318        samples, only those samples will be included in the output file
 2319        :type list_samples: list
 2320        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2321        determines whether or not to create an index for the output VCF file. If `index` is set to
 2322        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2323        :type index: bool (optional)
 2324        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2325        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2326        will be used during the export process. More threads can potentially speed up the export process
 2327        by utilizing multiple cores of the processor. If
 2328        :type threads: int | None
 2329        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2330        method with various parameters including the output file, query, threads, sort flag, and index
 2331        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2332        specified parameters and configurations provided in the `export_variant_vcf` function.
 2333        """
 2334
 2335        # Config
 2336        config = self.get_config()
 2337
 2338        # Extract VCF
 2339        log.debug("Export VCF...")
 2340
 2341        # Table variants
 2342        table_variants = self.get_table_variants()
 2343
 2344        # Threads
 2345        if not threads:
 2346            threads = self.get_threads()
 2347
 2348        # Info fields
 2349        if remove_info:
 2350            if not isinstance(remove_info, str):
 2351                remove_info = "."
 2352            info_field = f"""'{remove_info}' as INFO"""
 2353        else:
 2354            info_field = "INFO"
 2355
 2356        # Samples fields
 2357        if add_samples:
 2358            if not list_samples:
 2359                list_samples = self.get_header_sample_list()
 2360            if list_samples:
 2361                samples_fields = " , FORMAT , " + " , ".join(list_samples)
 2362            else:
 2363                samples_fields = ""
 2364            log.debug(f"samples_fields: {samples_fields}")
 2365        else:
 2366            samples_fields = ""
 2367
 2368        # Where clause
 2369        if where_clause is None:
 2370            where_clause = ""
 2371
 2372        # Variants
 2373        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2374        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2375        log.debug(f"sql_query_select={sql_query_select}")
 2376
 2377        return self.export_output(
 2378            output_file=vcf_file,
 2379            output_header=None,
 2380            export_header=True,
 2381            query=sql_query_select,
 2382            parquet_partitions=None,
 2383            chunk_size=config.get("chunk_size", None),
 2384            threads=threads,
 2385            sort=True,
 2386            index=index,
 2387            order_by=None,
 2388        )
 2389
 2390    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2391        """
 2392        It takes a list of commands and runs them in parallel using the number of threads specified
 2393
 2394        :param commands: A list of commands to run
 2395        :param threads: The number of threads to use, defaults to 1 (optional)
 2396        """
 2397
 2398        run_parallel_commands(commands, threads)
 2399
 2400    def get_threads(self, default: int = 1) -> int:
 2401        """
 2402        This function returns the number of threads to use for a job, with a default value of 1 if not
 2403        specified.
 2404
 2405        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2406        default number of threads to use if no specific value is provided. If no value is provided for
 2407        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2408        used, defaults to 1
 2409        :type default: int (optional)
 2410        :return: the number of threads to use for the current job.
 2411        """
 2412
 2413        # Config
 2414        config = self.get_config()
 2415
 2416        # Param
 2417        param = self.get_param()
 2418
 2419        # Input threads
 2420        input_thread = param.get("threads", config.get("threads", None))
 2421
 2422        # Check threads
 2423        if not input_thread:
 2424            threads = default
 2425        elif int(input_thread) <= 0:
 2426            threads = os.cpu_count()
 2427        else:
 2428            threads = int(input_thread)
 2429        return threads
 2430
 2431    def get_memory(self, default: str = None) -> str:
 2432        """
 2433        This function retrieves the memory value from parameters or configuration with a default value
 2434        if not found.
 2435
 2436        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2437        default value is used as a fallback in case the `memory` parameter is not provided in the
 2438        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2439        the function
 2440        :type default: str
 2441        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2442        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2443        return the default value provided as an argument to the function.
 2444        """
 2445
 2446        # Config
 2447        config = self.get_config()
 2448
 2449        # Param
 2450        param = self.get_param()
 2451
 2452        # Input threads
 2453        input_memory = param.get("memory", config.get("memory", None))
 2454
 2455        # Check threads
 2456        if input_memory:
 2457            memory = input_memory
 2458        else:
 2459            memory = default
 2460
 2461        return memory
 2462
 2463    def update_from_vcf(self, vcf_file: str) -> None:
 2464        """
 2465        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2466
 2467        :param vcf_file: the path to the VCF file
 2468        """
 2469
 2470        connexion_format = self.get_connexion_format()
 2471
 2472        if connexion_format in ["duckdb"]:
 2473            self.update_from_vcf_duckdb(vcf_file)
 2474        elif connexion_format in ["sqlite"]:
 2475            self.update_from_vcf_sqlite(vcf_file)
 2476
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (duckdb backend).

        The VCF body is loaded into a pandas DataFrame named ``vcf_df``; the
        UPDATE query references ``vcf_df`` by name, relying on DuckDB's
        replacement scan which resolves DataFrames from the local Python scope
        — so the variable name and the table name in the SQL must match.
        Records are matched on #CHROM/POS/REF/ALT; a ';' separator is inserted
        only when both sides carry a non-empty INFO.

        :param vcf_file: the path to the VCF file
        """

        # variants table name
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame: skip the '##' meta-header lines
        # so that the '#CHROM' line becomes the column header.
        skip = self.get_header_length(file=vcf_file)
        # NOTE: vcf_df looks unused but is referenced by name inside the SQL
        # below (DuckDB replacement scan) — do not rename or remove it.
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the matching VCF INFO to each variant's INFO. The subquery
        # only considers VCF records with a non-empty INFO; when no record
        # matches, concat() treats the NULL subquery result as an empty
        # string, leaving the variant's INFO unchanged.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2532
 2533    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
 2534        """
 2535        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
 2536        table, then updates the INFO column of the variants table with the INFO column of the temporary
 2537        table
 2538
 2539        :param vcf_file: The path to the VCF file you want to update the database with
 2540        """
 2541
 2542        # Create a temporary table for the VCF
 2543        table_vcf = "tmp_vcf"
 2544        sql_create = (
 2545            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
 2546        )
 2547        self.conn.execute(sql_create)
 2548
 2549        # Loading VCF into temporaire table
 2550        vcf_df = pd.read_csv(
 2551            vcf_file, sep="\t", comment="#", header=None, low_memory=False
 2552        )
 2553        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
 2554        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
 2555
 2556        # Update table 'variants' with VCF data
 2557        # warning: CONCAT as || operator
 2558        sql_query_update = f"""
 2559            UPDATE variants as table_variants
 2560            SET INFO = CASE
 2561                            WHEN INFO NOT IN ('', '.')
 2562                            THEN INFO
 2563                            ELSE ''
 2564                        END ||
 2565                        (
 2566                        SELECT 
 2567                            CASE 
 2568                                WHEN table_variants.INFO NOT IN ('','.') 
 2569                                    AND table_vcf.INFO NOT IN ('','.')  
 2570                                THEN ';' 
 2571                                ELSE '' 
 2572                            END || 
 2573                            CASE 
 2574                                WHEN table_vcf.INFO NOT IN ('','.') 
 2575                                THEN table_vcf.INFO 
 2576                                ELSE '' 
 2577                            END
 2578                        FROM {table_vcf} as table_vcf
 2579                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
 2580                            AND table_vcf.\"POS\" = table_variants.\"POS\"
 2581                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
 2582                            AND table_vcf.\"REF\" = table_variants.\"REF\"
 2583                        )
 2584        """
 2585        self.conn.execute(sql_query_update)
 2586
 2587        # Drop temporary table
 2588        sql_drop = f"DROP TABLE {table_vcf}"
 2589        self.conn.execute(sql_drop)
 2590
 2591    def drop_variants_table(self) -> None:
 2592        """
 2593        > This function drops the variants table
 2594        """
 2595
 2596        table_variants = self.get_table_variants()
 2597        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2598        self.conn.execute(sql_table_variants)
 2599
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a variant-identifier column to the variants table and populate it
        with a hash built from the assembly name and the `#CHROM`, `POS`,
        `REF` and `ALT` columns.

        The INFO/SVTYPE field is temporarily exploded into its own column for
        the duration of the update, then removed again.

        :param variant_id_column: The name of the column to be created in the
        variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column is (re)created even if it
        already exists
        :type force: bool
        :return: The name of the column that contains the variant id
        """

        # Assembly: param overrides config, fallback to DEFAULT_ASSEMBLY
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column (tracked for later cleanup)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # Fallback column name
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        # NOTE(review): the existence check uses the literal name "variant_id"
        # even when a custom variant_id_column is requested — confirm intent
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash() argument '"{prefix}SVTYPE"' is a
            # single-quoted SQL string literal, so the hash mixes in the
            # constant column NAME text rather than the SVTYPE column's
            # value — confirm this is intentional
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove the temporary exploded columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
 2658
 2659    def get_variant_id_column(
 2660        self, variant_id_column: str = "variant_id", force: bool = None
 2661    ) -> str:
 2662        """
 2663        This function returns the variant_id column name
 2664
 2665        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2666        defaults to variant_id
 2667        :type variant_id_column: str (optional)
 2668        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2669        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2670        if it is not already set, or if it is set
 2671        :type force: bool
 2672        :return: The variant_id column name.
 2673        """
 2674
 2675        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2676
 2677    ###
 2678    # Annotation
 2679    ###
 2680
 2681    def scan_databases(
 2682        self,
 2683        database_formats: list = ["parquet"],
 2684        database_releases: list = ["current"],
 2685    ) -> dict:
 2686        """
 2687        The function `scan_databases` scans for available databases based on specified formats and
 2688        releases.
 2689
 2690        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2691        of the databases to be scanned. In this case, the accepted format is "parquet"
 2692        :type database_formats: list ["parquet"]
 2693        :param database_releases: The `database_releases` parameter is a list that specifies the
 2694        releases of the databases to be scanned. In the provided function, the default value for
 2695        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2696        databases that are in the "current"
 2697        :type database_releases: list
 2698        :return: The function `scan_databases` returns a dictionary containing information about
 2699        databases that match the specified formats and releases.
 2700        """
 2701
 2702        # Config
 2703        config = self.get_config()
 2704
 2705        # Param
 2706        param = self.get_param()
 2707
 2708        # Param - Assembly
 2709        assembly = param.get("assembly", config.get("assembly", None))
 2710        if not assembly:
 2711            assembly = DEFAULT_ASSEMBLY
 2712            log.warning(f"Default assembly '{assembly}'")
 2713
 2714        # Scan for availabled databases
 2715        log.info(
 2716            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2717        )
 2718        databases_infos_dict = databases_infos(
 2719            database_folder_releases=database_releases,
 2720            database_formats=database_formats,
 2721            assembly=assembly,
 2722            config=config,
 2723        )
 2724        log.info(
 2725            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2726        )
 2727
 2728        return databases_infos_dict
 2729
 2730    def annotation(self) -> None:
 2731        """
 2732        It annotates the VCF file with the annotations specified in the config file.
 2733        """
 2734
 2735        # Config
 2736        config = self.get_config()
 2737
 2738        # Param
 2739        param = self.get_param()
 2740
 2741        # Param - Assembly
 2742        assembly = param.get("assembly", config.get("assembly", None))
 2743        if not assembly:
 2744            assembly = DEFAULT_ASSEMBLY
 2745            log.warning(f"Default assembly '{assembly}'")
 2746
 2747        # annotations databases folders
 2748        annotations_databases = set(
 2749            config.get("folders", {})
 2750            .get("databases", {})
 2751            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
 2752            + config.get("folders", {})
 2753            .get("databases", {})
 2754            .get("parquet", ["~/howard/databases/parquet/current"])
 2755            + config.get("folders", {})
 2756            .get("databases", {})
 2757            .get("bcftools", ["~/howard/databases/bcftools/current"])
 2758        )
 2759
 2760        # Get param annotations
 2761        if param.get("annotations", None) and isinstance(
 2762            param.get("annotations", None), str
 2763        ):
 2764            log.debug(param.get("annotations", None))
 2765            param_annotation_list = param.get("annotations").split(",")
 2766        else:
 2767            param_annotation_list = []
 2768
 2769        # Each tools param
 2770        if param.get("annotation_parquet", None) != None:
 2771            log.debug(
 2772                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
 2773            )
 2774            if isinstance(param.get("annotation_parquet", None), list):
 2775                param_annotation_list.append(",".join(param.get("annotation_parquet")))
 2776            else:
 2777                param_annotation_list.append(param.get("annotation_parquet"))
 2778        if param.get("annotation_snpsift", None) != None:
 2779            if isinstance(param.get("annotation_snpsift", None), list):
 2780                param_annotation_list.append(
 2781                    "snpsift:"
 2782                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
 2783                )
 2784            else:
 2785                param_annotation_list.append(
 2786                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
 2787                )
 2788        if param.get("annotation_snpeff", None) != None:
 2789            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
 2790        if param.get("annotation_bcftools", None) != None:
 2791            if isinstance(param.get("annotation_bcftools", None), list):
 2792                param_annotation_list.append(
 2793                    "bcftools:"
 2794                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
 2795                )
 2796            else:
 2797                param_annotation_list.append(
 2798                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
 2799                )
 2800        if param.get("annotation_annovar", None) != None:
 2801            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
 2802        if param.get("annotation_exomiser", None) != None:
 2803            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
 2804        if param.get("annotation_splice", None) != None:
 2805            param_annotation_list.append("splice:" + param.get("annotation_splice"))
 2806
 2807        # Merge param annotations list
 2808        param["annotations"] = ",".join(param_annotation_list)
 2809
 2810        # debug
 2811        log.debug(f"param_annotations={param['annotations']}")
 2812
 2813        if param.get("annotations"):
 2814
 2815            # Log
 2816            # log.info("Annotations - Check annotation parameters")
 2817
 2818            if not "annotation" in param:
 2819                param["annotation"] = {}
 2820
 2821            # List of annotations parameters
 2822            annotations_list_input = {}
 2823            if isinstance(param.get("annotations", None), str):
 2824                annotation_file_list = [
 2825                    value for value in param.get("annotations", "").split(",")
 2826                ]
 2827                for annotation_file in annotation_file_list:
 2828                    annotations_list_input[annotation_file] = {"INFO": None}
 2829            else:
 2830                annotations_list_input = param.get("annotations", {})
 2831
 2832            log.info(f"Quick Annotations:")
 2833            for annotation_key in list(annotations_list_input.keys()):
 2834                log.info(f"   {annotation_key}")
 2835
 2836            # List of annotations and associated fields
 2837            annotations_list = {}
 2838
 2839            for annotation_file in annotations_list_input:
 2840
 2841                # Explode annotations if ALL
 2842                if (
 2843                    annotation_file.upper() == "ALL"
 2844                    or annotation_file.upper().startswith("ALL:")
 2845                ):
 2846
 2847                    # check ALL parameters (formats, releases)
 2848                    annotation_file_split = annotation_file.split(":")
 2849                    database_formats = "parquet"
 2850                    database_releases = "current"
 2851                    for annotation_file_option in annotation_file_split[1:]:
 2852                        database_all_options_split = annotation_file_option.split("=")
 2853                        if database_all_options_split[0] == "format":
 2854                            database_formats = database_all_options_split[1].split("+")
 2855                        if database_all_options_split[0] == "release":
 2856                            database_releases = database_all_options_split[1].split("+")
 2857
 2858                    # Scan for availabled databases
 2859                    databases_infos_dict = self.scan_databases(
 2860                        database_formats=database_formats,
 2861                        database_releases=database_releases,
 2862                    )
 2863
 2864                    # Add found databases in annotation parameters
 2865                    for database_infos in databases_infos_dict.keys():
 2866                        annotations_list[database_infos] = {"INFO": None}
 2867
 2868                else:
 2869                    annotations_list[annotation_file] = annotations_list_input[
 2870                        annotation_file
 2871                    ]
 2872
 2873            # Check each databases
 2874            if len(annotations_list):
 2875
 2876                log.info(
 2877                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
 2878                )
 2879
 2880                for annotation_file in annotations_list:
 2881
 2882                    # Init
 2883                    annotations = annotations_list.get(annotation_file, None)
 2884
 2885                    # Annotation snpEff
 2886                    if annotation_file.startswith("snpeff"):
 2887
 2888                        log.debug(f"Quick Annotation snpEff")
 2889
 2890                        if "snpeff" not in param["annotation"]:
 2891                            param["annotation"]["snpeff"] = {}
 2892
 2893                        if "options" not in param["annotation"]["snpeff"]:
 2894                            param["annotation"]["snpeff"]["options"] = ""
 2895
 2896                        # snpEff options in annotations
 2897                        param["annotation"]["snpeff"]["options"] = "".join(
 2898                            annotation_file.split(":")[1:]
 2899                        )
 2900
 2901                    # Annotation Annovar
 2902                    elif annotation_file.startswith("annovar"):
 2903
 2904                        log.debug(f"Quick Annotation Annovar")
 2905
 2906                        if "annovar" not in param["annotation"]:
 2907                            param["annotation"]["annovar"] = {}
 2908
 2909                        if "annotations" not in param["annotation"]["annovar"]:
 2910                            param["annotation"]["annovar"]["annotations"] = {}
 2911
 2912                        # Options
 2913                        annotation_file_split = annotation_file.split(":")
 2914                        for annotation_file_annotation in annotation_file_split[1:]:
 2915                            if annotation_file_annotation:
 2916                                param["annotation"]["annovar"]["annotations"][
 2917                                    annotation_file_annotation
 2918                                ] = annotations
 2919
 2920                    # Annotation Exomiser
 2921                    elif annotation_file.startswith("exomiser"):
 2922
 2923                        log.debug(f"Quick Annotation Exomiser")
 2924
 2925                        param["annotation"]["exomiser"] = params_string_to_dict(
 2926                            annotation_file
 2927                        )
 2928
 2929                    # Annotation Splice
 2930                    elif annotation_file.startswith("splice"):
 2931
 2932                        log.debug(f"Quick Annotation Splice")
 2933
 2934                        param["annotation"]["splice"] = params_string_to_dict(
 2935                            annotation_file
 2936                        )
 2937
 2938                    # Annotation Parquet or BCFTOOLS
 2939                    else:
 2940
 2941                        # Tools detection
 2942                        if annotation_file.startswith("bcftools:"):
 2943                            annotation_tool_initial = "bcftools"
 2944                            annotation_file = ":".join(annotation_file.split(":")[1:])
 2945                        elif annotation_file.startswith("snpsift:"):
 2946                            annotation_tool_initial = "snpsift"
 2947                            annotation_file = ":".join(annotation_file.split(":")[1:])
 2948                        else:
 2949                            annotation_tool_initial = None
 2950
 2951                        # list of files
 2952                        annotation_file_list = annotation_file.replace("+", ":").split(
 2953                            ":"
 2954                        )
 2955
 2956                        for annotation_file in annotation_file_list:
 2957
 2958                            if annotation_file:
 2959
 2960                                # Annotation tool initial
 2961                                annotation_tool = annotation_tool_initial
 2962
 2963                                # Find file
 2964                                annotation_file_found = None
 2965
 2966                                # Expand user
 2967                                annotation_file = full_path(annotation_file)
 2968
 2969                                if os.path.exists(annotation_file):
 2970                                    annotation_file_found = annotation_file
 2971
 2972                                else:
 2973                                    # Find within assembly folders
 2974                                    for annotations_database in annotations_databases:
 2975                                        found_files = find_all(
 2976                                            annotation_file,
 2977                                            os.path.join(
 2978                                                annotations_database, assembly
 2979                                            ),
 2980                                        )
 2981                                        if len(found_files) > 0:
 2982                                            annotation_file_found = found_files[0]
 2983                                            break
 2984                                    if not annotation_file_found and not assembly:
 2985                                        # Find within folders
 2986                                        for (
 2987                                            annotations_database
 2988                                        ) in annotations_databases:
 2989                                            found_files = find_all(
 2990                                                annotation_file, annotations_database
 2991                                            )
 2992                                            if len(found_files) > 0:
 2993                                                annotation_file_found = found_files[0]
 2994                                                break
 2995                                log.debug(
 2996                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
 2997                                )
 2998
 2999                                # Full path
 3000                                annotation_file_found = full_path(annotation_file_found)
 3001
 3002                                if annotation_file_found:
 3003
 3004                                    database = Database(database=annotation_file_found)
 3005                                    quick_annotation_format = database.get_format()
 3006                                    quick_annotation_is_compressed = (
 3007                                        database.is_compressed()
 3008                                    )
 3009                                    quick_annotation_is_indexed = os.path.exists(
 3010                                        f"{annotation_file_found}.tbi"
 3011                                    )
 3012                                    bcftools_preference = False
 3013
 3014                                    # Check Annotation Tool
 3015                                    if not annotation_tool:
 3016                                        if (
 3017                                            bcftools_preference
 3018                                            and quick_annotation_format
 3019                                            in ["vcf", "bed"]
 3020                                            and quick_annotation_is_compressed
 3021                                            and quick_annotation_is_indexed
 3022                                        ):
 3023                                            annotation_tool = "bcftools"
 3024                                        elif quick_annotation_format in [
 3025                                            "vcf",
 3026                                            "bed",
 3027                                            "tsv",
 3028                                            "tsv",
 3029                                            "csv",
 3030                                            "json",
 3031                                            "tbl",
 3032                                            "parquet",
 3033                                            "duckdb",
 3034                                        ]:
 3035                                            annotation_tool = "parquet"
 3036                                        else:
 3037                                            log.error(
 3038                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3039                                            )
 3040                                            raise ValueError(
 3041                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3042                                            )
 3043
 3044                                    log.debug(
 3045                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
 3046                                    )
 3047
 3048                                    # Annotation Tool dispatch
 3049                                    if annotation_tool:
 3050                                        if annotation_tool not in param["annotation"]:
 3051                                            param["annotation"][annotation_tool] = {}
 3052                                        if (
 3053                                            "annotations"
 3054                                            not in param["annotation"][annotation_tool]
 3055                                        ):
 3056                                            param["annotation"][annotation_tool][
 3057                                                "annotations"
 3058                                            ] = {}
 3059                                        param["annotation"][annotation_tool][
 3060                                            "annotations"
 3061                                        ][annotation_file_found] = annotations
 3062
 3063                                else:
 3064                                    log.error(
 3065                                        f"Quick Annotation File {annotation_file} does NOT exist"
 3066                                    )
 3067
 3068                self.set_param(param)
 3069
 3070        if param.get("annotation", None):
 3071            log.info("Annotations")
 3072            if param.get("annotation", {}).get("parquet", None):
 3073                log.info("Annotations 'parquet'...")
 3074                self.annotation_parquet()
 3075            if param.get("annotation", {}).get("bcftools", None):
 3076                log.info("Annotations 'bcftools'...")
 3077                self.annotation_bcftools()
 3078            if param.get("annotation", {}).get("snpsift", None):
 3079                log.info("Annotations 'snpsift'...")
 3080                self.annotation_snpsift()
 3081            if param.get("annotation", {}).get("annovar", None):
 3082                log.info("Annotations 'annovar'...")
 3083                self.annotation_annovar()
 3084            if param.get("annotation", {}).get("snpeff", None):
 3085                log.info("Annotations 'snpeff'...")
 3086                self.annotation_snpeff()
 3087            if param.get("annotation", {}).get("exomiser", None) is not None:
 3088                log.info("Annotations 'exomiser'...")
 3089                self.annotation_exomiser()
 3090            if param.get("annotation", {}).get("splice", None) is not None:
 3091                log.info("Annotations 'splice' ...")
 3092                self.annotation_splice()
 3093
 3094        # Explode INFOS fields into table fields
 3095        if self.get_explode_infos():
 3096            self.explode_infos(
 3097                prefix=self.get_explode_infos_prefix(),
 3098                fields=self.get_explode_infos_fields(),
 3099                force=True,
 3100            )
 3101
 3102    def annotation_snpsift(self, threads: int = None) -> None:
 3103        """
 3104        This function annotate with bcftools
 3105
 3106        :param threads: Number of threads to use
 3107        :return: the value of the variable "return_value".
 3108        """
 3109
 3110        # DEBUG
 3111        log.debug("Start annotation with bcftools databases")
 3112
 3113        # Threads
 3114        if not threads:
 3115            threads = self.get_threads()
 3116        log.debug("Threads: " + str(threads))
 3117
 3118        # Config
 3119        config = self.get_config()
 3120        log.debug("Config: " + str(config))
 3121
 3122        # Config - snpSift
 3123        snpsift_bin_command = get_bin_command(
 3124            bin="SnpSift.jar",
 3125            tool="snpsift",
 3126            bin_type="jar",
 3127            config=config,
 3128            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3129        )
 3130        if not snpsift_bin_command:
 3131            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3132            log.error(msg_err)
 3133            raise ValueError(msg_err)
 3134
 3135        # Config - bcftools
 3136        bcftools_bin_command = get_bin_command(
 3137            bin="bcftools",
 3138            tool="bcftools",
 3139            bin_type="bin",
 3140            config=config,
 3141            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3142        )
 3143        if not bcftools_bin_command:
 3144            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3145            log.error(msg_err)
 3146            raise ValueError(msg_err)
 3147
 3148        # Config - BCFTools databases folders
 3149        databases_folders = set(
 3150            self.get_config()
 3151            .get("folders", {})
 3152            .get("databases", {})
 3153            .get("annotations", ["."])
 3154            + self.get_config()
 3155            .get("folders", {})
 3156            .get("databases", {})
 3157            .get("bcftools", ["."])
 3158        )
 3159        log.debug("Databases annotations: " + str(databases_folders))
 3160
 3161        # Param
 3162        annotations = (
 3163            self.get_param()
 3164            .get("annotation", {})
 3165            .get("snpsift", {})
 3166            .get("annotations", None)
 3167        )
 3168        log.debug("Annotations: " + str(annotations))
 3169
 3170        # Assembly
 3171        assembly = self.get_param().get(
 3172            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3173        )
 3174
 3175        # Data
 3176        table_variants = self.get_table_variants()
 3177
 3178        # Check if not empty
 3179        log.debug("Check if not empty")
 3180        sql_query_chromosomes = (
 3181            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3182        )
 3183        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3184        if not sql_query_chromosomes_df["count"][0]:
 3185            log.info(f"VCF empty")
 3186            return
 3187
 3188        # VCF header
 3189        vcf_reader = self.get_header()
 3190        log.debug("Initial header: " + str(vcf_reader.infos))
 3191
 3192        # Existing annotations
 3193        for vcf_annotation in self.get_header().infos:
 3194
 3195            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3196            log.debug(
 3197                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3198            )
 3199
 3200        if annotations:
 3201
 3202            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3203
 3204                # Export VCF file
 3205                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3206
 3207                # Init
 3208                commands = {}
 3209
 3210                for annotation in annotations:
 3211                    annotation_fields = annotations[annotation]
 3212
 3213                    # Annotation Name
 3214                    annotation_name = os.path.basename(annotation)
 3215
 3216                    if not annotation_fields:
 3217                        annotation_fields = {"INFO": None}
 3218
 3219                    log.debug(f"Annotation '{annotation_name}'")
 3220                    log.debug(
 3221                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3222                    )
 3223
 3224                    # Create Database
 3225                    database = Database(
 3226                        database=annotation,
 3227                        databases_folders=databases_folders,
 3228                        assembly=assembly,
 3229                    )
 3230
 3231                    # Find files
 3232                    db_file = database.get_database()
 3233                    db_file = full_path(db_file)
 3234                    db_hdr_file = database.get_header_file()
 3235                    db_hdr_file = full_path(db_hdr_file)
 3236                    db_file_type = database.get_format()
 3237                    db_tbi_file = f"{db_file}.tbi"
 3238                    db_file_compressed = database.is_compressed()
 3239
 3240                    # Check if compressed
 3241                    if not db_file_compressed:
 3242                        log.error(
 3243                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3244                        )
 3245                        raise ValueError(
 3246                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3247                        )
 3248
 3249                    # Check if indexed
 3250                    if not os.path.exists(db_tbi_file):
 3251                        log.error(
 3252                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3253                        )
 3254                        raise ValueError(
 3255                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3256                        )
 3257
 3258                    # Check index - try to create if not exists
 3259                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3260                        log.error("Annotation failed: database not valid")
 3261                        log.error(f"Annotation annotation file: {db_file}")
 3262                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3263                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3264                        raise ValueError(
 3265                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3266                        )
 3267                    else:
 3268
 3269                        log.debug(
 3270                            f"Annotation '{annotation}' - file: "
 3271                            + str(db_file)
 3272                            + " and "
 3273                            + str(db_hdr_file)
 3274                        )
 3275
 3276                        # Load header as VCF object
 3277                        db_hdr_vcf = Variants(input=db_hdr_file)
 3278                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3279                        log.debug(
 3280                            "Annotation database header: "
 3281                            + str(db_hdr_vcf_header_infos)
 3282                        )
 3283
 3284                        # For all fields in database
 3285                        annotation_fields_full = False
 3286                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3287                            annotation_fields = {
 3288                                key: key for key in db_hdr_vcf_header_infos
 3289                            }
 3290                            log.debug(
 3291                                "Annotation database header - All annotations added: "
 3292                                + str(annotation_fields)
 3293                            )
 3294                            annotation_fields_full = True
 3295
 3296                        # # Create file for field rename
 3297                        # log.debug("Create file for field rename")
 3298                        # tmp_rename = NamedTemporaryFile(
 3299                        #     prefix=self.get_prefix(),
 3300                        #     dir=self.get_tmp_dir(),
 3301                        #     suffix=".rename",
 3302                        #     delete=False,
 3303                        # )
 3304                        # tmp_rename_name = tmp_rename.name
 3305                        # tmp_files.append(tmp_rename_name)
 3306
 3307                        # Number of fields
 3308                        nb_annotation_field = 0
 3309                        annotation_list = []
 3310                        annotation_infos_rename_list = []
 3311
 3312                        for annotation_field in annotation_fields:
 3313
 3314                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3315                            annotation_fields_new_name = annotation_fields.get(
 3316                                annotation_field, annotation_field
 3317                            )
 3318                            if not annotation_fields_new_name:
 3319                                annotation_fields_new_name = annotation_field
 3320
 3321                            # Check if field is in DB and if field is not elready in input data
 3322                            if (
 3323                                annotation_field in db_hdr_vcf.get_header().infos
 3324                                and annotation_fields_new_name
 3325                                not in self.get_header().infos
 3326                            ):
 3327
 3328                                log.info(
 3329                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3330                                )
 3331
 3332                                # BCFTools annotate param to rename fields
 3333                                if annotation_field != annotation_fields_new_name:
 3334                                    annotation_infos_rename_list.append(
 3335                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3336                                    )
 3337
 3338                                # Add INFO field to header
 3339                                db_hdr_vcf_header_infos_number = (
 3340                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3341                                )
 3342                                db_hdr_vcf_header_infos_type = (
 3343                                    db_hdr_vcf_header_infos[annotation_field].type
 3344                                    or "String"
 3345                                )
 3346                                db_hdr_vcf_header_infos_description = (
 3347                                    db_hdr_vcf_header_infos[annotation_field].desc
 3348                                    or f"{annotation_field} description"
 3349                                )
 3350                                db_hdr_vcf_header_infos_source = (
 3351                                    db_hdr_vcf_header_infos[annotation_field].source
 3352                                    or "unknown"
 3353                                )
 3354                                db_hdr_vcf_header_infos_version = (
 3355                                    db_hdr_vcf_header_infos[annotation_field].version
 3356                                    or "unknown"
 3357                                )
 3358
 3359                                vcf_reader.infos[annotation_fields_new_name] = (
 3360                                    vcf.parser._Info(
 3361                                        annotation_fields_new_name,
 3362                                        db_hdr_vcf_header_infos_number,
 3363                                        db_hdr_vcf_header_infos_type,
 3364                                        db_hdr_vcf_header_infos_description,
 3365                                        db_hdr_vcf_header_infos_source,
 3366                                        db_hdr_vcf_header_infos_version,
 3367                                        self.code_type_map[
 3368                                            db_hdr_vcf_header_infos_type
 3369                                        ],
 3370                                    )
 3371                                )
 3372
 3373                                annotation_list.append(annotation_field)
 3374
 3375                                nb_annotation_field += 1
 3376
 3377                            else:
 3378
 3379                                if (
 3380                                    annotation_field
 3381                                    not in db_hdr_vcf.get_header().infos
 3382                                ):
 3383                                    log.warning(
 3384                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3385                                    )
 3386                                if (
 3387                                    annotation_fields_new_name
 3388                                    in self.get_header().infos
 3389                                ):
 3390                                    log.warning(
 3391                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3392                                    )
 3393
 3394                        log.info(
 3395                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3396                        )
 3397
 3398                        annotation_infos = ",".join(annotation_list)
 3399
 3400                        if annotation_infos != "":
 3401
 3402                            # Annotated VCF (and error file)
 3403                            tmp_annotation_vcf_name = os.path.join(
 3404                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3405                            )
 3406                            tmp_annotation_vcf_name_err = (
 3407                                tmp_annotation_vcf_name + ".err"
 3408                            )
 3409
 3410                            # Add fields to annotate
 3411                            if not annotation_fields_full:
 3412                                annotation_infos_option = f"-info {annotation_infos}"
 3413                            else:
 3414                                annotation_infos_option = ""
 3415
 3416                            # Info fields rename
 3417                            if annotation_infos_rename_list:
 3418                                annotation_infos_rename = " -c " + ",".join(
 3419                                    annotation_infos_rename_list
 3420                                )
 3421                            else:
 3422                                annotation_infos_rename = ""
 3423
 3424                            # Annotate command
 3425                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3426
 3427                            # Add command
 3428                            commands[command_annotate] = tmp_annotation_vcf_name
 3429
 3430                if commands:
 3431
 3432                    # Export VCF file
 3433                    self.export_variant_vcf(
 3434                        vcf_file=tmp_vcf_name,
 3435                        remove_info=True,
 3436                        add_samples=False,
 3437                        index=True,
 3438                    )
 3439                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3440
 3441                    # Num command
 3442                    nb_command = 0
 3443
 3444                    # Annotate
 3445                    for command_annotate in commands:
 3446                        nb_command += 1
 3447                        log.info(
 3448                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3449                        )
 3450                        log.debug(f"command_annotate={command_annotate}")
 3451                        run_parallel_commands([command_annotate], threads)
 3452
 3453                        # Debug
 3454                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3455
 3456                        # Update variants
 3457                        log.info(
 3458                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3459                        )
 3460                        self.update_from_vcf(commands[command_annotate])
 3461
 3462    def annotation_bcftools(self, threads: int = None) -> None:
 3463        """
 3464        This function annotate with bcftools
 3465
 3466        :param threads: Number of threads to use
 3467        :return: the value of the variable "return_value".
 3468        """
 3469
 3470        # DEBUG
 3471        log.debug("Start annotation with bcftools databases")
 3472
 3473        # Threads
 3474        if not threads:
 3475            threads = self.get_threads()
 3476        log.debug("Threads: " + str(threads))
 3477
 3478        # Config
 3479        config = self.get_config()
 3480        log.debug("Config: " + str(config))
 3481
 3482        # DEBUG
 3483        delete_tmp = True
 3484        if self.get_config().get("verbosity", "warning") in ["debug"]:
 3485            delete_tmp = False
 3486            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 3487
 3488        # Config - BCFTools bin command
 3489        bcftools_bin_command = get_bin_command(
 3490            bin="bcftools",
 3491            tool="bcftools",
 3492            bin_type="bin",
 3493            config=config,
 3494            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3495        )
 3496        if not bcftools_bin_command:
 3497            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3498            log.error(msg_err)
 3499            raise ValueError(msg_err)
 3500
 3501        # Config - BCFTools databases folders
 3502        databases_folders = set(
 3503            self.get_config()
 3504            .get("folders", {})
 3505            .get("databases", {})
 3506            .get("annotations", ["."])
 3507            + self.get_config()
 3508            .get("folders", {})
 3509            .get("databases", {})
 3510            .get("bcftools", ["."])
 3511        )
 3512        log.debug("Databases annotations: " + str(databases_folders))
 3513
 3514        # Param
 3515        annotations = (
 3516            self.get_param()
 3517            .get("annotation", {})
 3518            .get("bcftools", {})
 3519            .get("annotations", None)
 3520        )
 3521        log.debug("Annotations: " + str(annotations))
 3522
 3523        # Assembly
 3524        assembly = self.get_param().get(
 3525            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3526        )
 3527
 3528        # Data
 3529        table_variants = self.get_table_variants()
 3530
 3531        # Check if not empty
 3532        log.debug("Check if not empty")
 3533        sql_query_chromosomes = (
 3534            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3535        )
 3536        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3537        if not sql_query_chromosomes_df["count"][0]:
 3538            log.info(f"VCF empty")
 3539            return
 3540
 3541        # Export in VCF
 3542        log.debug("Create initial file to annotate")
 3543        tmp_vcf = NamedTemporaryFile(
 3544            prefix=self.get_prefix(),
 3545            dir=self.get_tmp_dir(),
 3546            suffix=".vcf.gz",
 3547            delete=False,
 3548        )
 3549        tmp_vcf_name = tmp_vcf.name
 3550
 3551        # VCF header
 3552        vcf_reader = self.get_header()
 3553        log.debug("Initial header: " + str(vcf_reader.infos))
 3554
 3555        # Existing annotations
 3556        for vcf_annotation in self.get_header().infos:
 3557
 3558            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3559            log.debug(
 3560                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3561            )
 3562
 3563        if annotations:
 3564
 3565            tmp_ann_vcf_list = []
 3566            commands = []
 3567            tmp_files = []
 3568            err_files = []
 3569
 3570            for annotation in annotations:
 3571                annotation_fields = annotations[annotation]
 3572
 3573                # Annotation Name
 3574                annotation_name = os.path.basename(annotation)
 3575
 3576                if not annotation_fields:
 3577                    annotation_fields = {"INFO": None}
 3578
 3579                log.debug(f"Annotation '{annotation_name}'")
 3580                log.debug(
 3581                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3582                )
 3583
 3584                # Create Database
 3585                database = Database(
 3586                    database=annotation,
 3587                    databases_folders=databases_folders,
 3588                    assembly=assembly,
 3589                )
 3590
 3591                # Find files
 3592                db_file = database.get_database()
 3593                db_file = full_path(db_file)
 3594                db_hdr_file = database.get_header_file()
 3595                db_hdr_file = full_path(db_hdr_file)
 3596                db_file_type = database.get_format()
 3597                db_tbi_file = f"{db_file}.tbi"
 3598                db_file_compressed = database.is_compressed()
 3599
 3600                # Check if compressed
 3601                if not db_file_compressed:
 3602                    log.error(
 3603                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3604                    )
 3605                    raise ValueError(
 3606                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3607                    )
 3608
 3609                # Check if indexed
 3610                if not os.path.exists(db_tbi_file):
 3611                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 3612                    raise ValueError(
 3613                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3614                    )
 3615
 3616                # Check index - try to create if not exists
 3617                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3618                    log.error("Annotation failed: database not valid")
 3619                    log.error(f"Annotation annotation file: {db_file}")
 3620                    log.error(f"Annotation annotation header: {db_hdr_file}")
 3621                    log.error(f"Annotation annotation index: {db_tbi_file}")
 3622                    raise ValueError(
 3623                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3624                    )
 3625                else:
 3626
 3627                    log.debug(
 3628                        f"Annotation '{annotation}' - file: "
 3629                        + str(db_file)
 3630                        + " and "
 3631                        + str(db_hdr_file)
 3632                    )
 3633
 3634                    # Load header as VCF object
 3635                    db_hdr_vcf = Variants(input=db_hdr_file)
 3636                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3637                    log.debug(
 3638                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 3639                    )
 3640
 3641                    # For all fields in database
 3642                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3643                        annotation_fields = {
 3644                            key: key for key in db_hdr_vcf_header_infos
 3645                        }
 3646                        log.debug(
 3647                            "Annotation database header - All annotations added: "
 3648                            + str(annotation_fields)
 3649                        )
 3650
 3651                    # Number of fields
 3652                    nb_annotation_field = 0
 3653                    annotation_list = []
 3654
 3655                    for annotation_field in annotation_fields:
 3656
 3657                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3658                        annotation_fields_new_name = annotation_fields.get(
 3659                            annotation_field, annotation_field
 3660                        )
 3661                        if not annotation_fields_new_name:
 3662                            annotation_fields_new_name = annotation_field
 3663
 3664                        # Check if field is in DB and if field is not elready in input data
 3665                        if (
 3666                            annotation_field in db_hdr_vcf.get_header().infos
 3667                            and annotation_fields_new_name
 3668                            not in self.get_header().infos
 3669                        ):
 3670
 3671                            log.info(
 3672                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3673                            )
 3674
 3675                            # Add INFO field to header
 3676                            db_hdr_vcf_header_infos_number = (
 3677                                db_hdr_vcf_header_infos[annotation_field].num or "."
 3678                            )
 3679                            db_hdr_vcf_header_infos_type = (
 3680                                db_hdr_vcf_header_infos[annotation_field].type
 3681                                or "String"
 3682                            )
 3683                            db_hdr_vcf_header_infos_description = (
 3684                                db_hdr_vcf_header_infos[annotation_field].desc
 3685                                or f"{annotation_field} description"
 3686                            )
 3687                            db_hdr_vcf_header_infos_source = (
 3688                                db_hdr_vcf_header_infos[annotation_field].source
 3689                                or "unknown"
 3690                            )
 3691                            db_hdr_vcf_header_infos_version = (
 3692                                db_hdr_vcf_header_infos[annotation_field].version
 3693                                or "unknown"
 3694                            )
 3695
 3696                            vcf_reader.infos[annotation_fields_new_name] = (
 3697                                vcf.parser._Info(
 3698                                    annotation_fields_new_name,
 3699                                    db_hdr_vcf_header_infos_number,
 3700                                    db_hdr_vcf_header_infos_type,
 3701                                    db_hdr_vcf_header_infos_description,
 3702                                    db_hdr_vcf_header_infos_source,
 3703                                    db_hdr_vcf_header_infos_version,
 3704                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 3705                                )
 3706                            )
 3707
 3708                            # annotation_list.append(annotation_field)
 3709                            if annotation_field != annotation_fields_new_name:
 3710                                annotation_list.append(
 3711                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3712                                )
 3713                            else:
 3714                                annotation_list.append(annotation_field)
 3715
 3716                            nb_annotation_field += 1
 3717
 3718                        else:
 3719
 3720                            if annotation_field not in db_hdr_vcf.get_header().infos:
 3721                                log.warning(
 3722                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 3723                                )
 3724                            if annotation_fields_new_name in self.get_header().infos:
 3725                                log.warning(
 3726                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3727                                )
 3728
 3729                    log.info(
 3730                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3731                    )
 3732
 3733                    annotation_infos = ",".join(annotation_list)
 3734
 3735                    if annotation_infos != "":
 3736
 3737                        # Protect header for bcftools (remove "#CHROM" and variants line)
 3738                        log.debug("Protect Header file - remove #CHROM line if exists")
 3739                        tmp_header_vcf = NamedTemporaryFile(
 3740                            prefix=self.get_prefix(),
 3741                            dir=self.get_tmp_dir(),
 3742                            suffix=".hdr",
 3743                            delete=False,
 3744                        )
 3745                        tmp_header_vcf_name = tmp_header_vcf.name
 3746                        tmp_files.append(tmp_header_vcf_name)
 3747                        # Command
 3748                        if db_hdr_file.endswith(".gz"):
 3749                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 3750                        else:
 3751                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 3752                        # Run
 3753                        run_parallel_commands([command_extract_header], 1)
 3754
 3755                        # Find chromosomes
 3756                        log.debug("Find chromosomes ")
 3757                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 3758                        sql_query_chromosomes_df = self.get_query_to_df(
 3759                            sql_query_chromosomes
 3760                        )
 3761                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 3762
 3763                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 3764
 3765                        # BED columns in the annotation file
 3766                        if db_file_type in ["bed"]:
 3767                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 3768
 3769                        for chrom in chomosomes_list:
 3770
 3771                            # Create BED on initial VCF
 3772                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 3773                            tmp_bed = NamedTemporaryFile(
 3774                                prefix=self.get_prefix(),
 3775                                dir=self.get_tmp_dir(),
 3776                                suffix=".bed",
 3777                                delete=False,
 3778                            )
 3779                            tmp_bed_name = tmp_bed.name
 3780                            tmp_files.append(tmp_bed_name)
 3781
 3782                            # Detect regions
 3783                            log.debug(
 3784                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 3785                            )
 3786                            window = 1000000
 3787                            sql_query_intervals_for_bed = f"""
 3788                                SELECT  \"#CHROM\",
 3789                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 3790                                        \"POS\"+{window}
 3791                                FROM {table_variants} as table_variants
 3792                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 3793                            """
 3794                            regions = self.conn.execute(
 3795                                sql_query_intervals_for_bed
 3796                            ).fetchall()
 3797                            merged_regions = merge_regions(regions)
 3798                            log.debug(
 3799                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 3800                            )
 3801
 3802                            header = ["#CHROM", "START", "END"]
 3803                            with open(tmp_bed_name, "w") as f:
 3804                                # Write the header with tab delimiter
 3805                                f.write("\t".join(header) + "\n")
 3806                                for d in merged_regions:
 3807                                    # Write each data row with tab delimiter
 3808                                    f.write("\t".join(map(str, d)) + "\n")
 3809
 3810                            # Tmp files
 3811                            tmp_annotation_vcf = NamedTemporaryFile(
 3812                                prefix=self.get_prefix(),
 3813                                dir=self.get_tmp_dir(),
 3814                                suffix=".vcf.gz",
 3815                                delete=False,
 3816                            )
 3817                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 3818                            tmp_files.append(tmp_annotation_vcf_name)
 3819                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 3820                            tmp_annotation_vcf_name_err = (
 3821                                tmp_annotation_vcf_name + ".err"
 3822                            )
 3823                            err_files.append(tmp_annotation_vcf_name_err)
 3824
 3825                            # Annotate Command
 3826                            log.debug(
 3827                                f"Annotation '{annotation}' - add bcftools command"
 3828                            )
 3829
 3830                            # Command
 3831                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3832
 3833                            # Add command
 3834                            commands.append(command_annotate)
 3835
 3836            # if some commands
 3837            if commands:
 3838
 3839                # Export VCF file
 3840                self.export_variant_vcf(
 3841                    vcf_file=tmp_vcf_name,
 3842                    remove_info=True,
 3843                    add_samples=False,
 3844                    index=True,
 3845                )
 3846
 3847                # Threads
 3848                # calculate threads for annotated commands
 3849                if commands:
 3850                    threads_bcftools_annotate = round(threads / len(commands))
 3851                else:
 3852                    threads_bcftools_annotate = 1
 3853
 3854                if not threads_bcftools_annotate:
 3855                    threads_bcftools_annotate = 1
 3856
 3857                # Add threads option to bcftools commands
 3858                if threads_bcftools_annotate > 1:
 3859                    commands_threaded = []
 3860                    for command in commands:
 3861                        commands_threaded.append(
 3862                            command.replace(
 3863                                f"{bcftools_bin_command} annotate ",
 3864                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 3865                            )
 3866                        )
 3867                    commands = commands_threaded
 3868
 3869                # Command annotation multithreading
 3870                log.debug(f"Annotation - Annotation commands: " + str(commands))
 3871                log.info(
 3872                    f"Annotation - Annotation multithreaded in "
 3873                    + str(len(commands))
 3874                    + " commands"
 3875                )
 3876
 3877                run_parallel_commands(commands, threads)
 3878
 3879                # Merge
 3880                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 3881
 3882                if tmp_ann_vcf_list_cmd:
 3883
 3884                    # Tmp file
 3885                    tmp_annotate_vcf = NamedTemporaryFile(
 3886                        prefix=self.get_prefix(),
 3887                        dir=self.get_tmp_dir(),
 3888                        suffix=".vcf.gz",
 3889                        delete=True,
 3890                    )
 3891                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 3892                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 3893                    err_files.append(tmp_annotate_vcf_name_err)
 3894
 3895                    # Tmp file remove command
 3896                    tmp_files_remove_command = ""
 3897                    if tmp_files:
 3898                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 3899
 3900                    # Command merge
 3901                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 3902                    log.info(
 3903                        f"Annotation - Annotation merging "
 3904                        + str(len(commands))
 3905                        + " annotated files"
 3906                    )
 3907                    log.debug(f"Annotation - merge command: {merge_command}")
 3908                    run_parallel_commands([merge_command], 1)
 3909
 3910                    # Error messages
 3911                    log.info(f"Error/Warning messages:")
 3912                    error_message_command_all = []
 3913                    error_message_command_warning = []
 3914                    error_message_command_err = []
 3915                    for err_file in err_files:
 3916                        with open(err_file, "r") as f:
 3917                            for line in f:
 3918                                message = line.strip()
 3919                                error_message_command_all.append(message)
 3920                                if line.startswith("[W::"):
 3921                                    error_message_command_warning.append(message)
 3922                                if line.startswith("[E::"):
 3923                                    error_message_command_err.append(
 3924                                        f"{err_file}: " + message
 3925                                    )
 3926                    # log info
 3927                    for message in list(
 3928                        set(error_message_command_err + error_message_command_warning)
 3929                    ):
 3930                        log.info(f"   {message}")
 3931                    # debug info
 3932                    for message in list(set(error_message_command_all)):
 3933                        log.debug(f"   {message}")
 3934                    # failed
 3935                    if len(error_message_command_err):
 3936                        log.error("Annotation failed: Error in commands")
 3937                        raise ValueError("Annotation failed: Error in commands")
 3938
 3939                    # Update variants
 3940                    log.info(f"Annotation - Updating...")
 3941                    self.update_from_vcf(tmp_annotate_vcf_name)
 3942
 3943    def annotation_exomiser(self, threads: int = None) -> None:
 3944        """
 3945        This function annotate with Exomiser
 3946
 3947        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 3948        - "analysis" (dict/file):
 3949            Full analysis dictionary parameters (see Exomiser docs).
 3950            Either a dict, or a file in JSON or YAML format.
 3951            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
 3952            Default : None
 3953        - "preset" (string):
 3954            Analysis preset (available in config folder).
 3955            Used if no full "analysis" is provided.
 3956            Default: "exome"
 3957        - "phenopacket" (dict/file):
 3958            Samples and phenotypic features parameters (see Exomiser docs).
 3959            Either a dict, or a file in JSON or YAML format.
 3960            Default: None
 3961        - "subject" (dict):
 3962            Sample parameters (see Exomiser docs).
 3963            Example:
 3964                "subject":
 3965                    {
 3966                        "id": "ISDBM322017",
 3967                        "sex": "FEMALE"
 3968                    }
 3969            Default: None
 3970        - "sample" (string):
 3971            Sample name to construct "subject" section:
 3972                "subject":
 3973                    {
 3974                        "id": "<sample>",
 3975                        "sex": "UNKNOWN_SEX"
 3976                    }
 3977            Default: None
 3978        - "phenotypicFeatures" (dict)
 3979            Phenotypic features to construct "subject" section.
 3980            Example:
 3981                "phenotypicFeatures":
 3982                    [
 3983                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 3984                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 3985                    ]
 3986        - "hpo" (list)
 3987            List of HPO ids as phenotypic features.
 3988            Example:
 3989                "hpo": ['0001156', '0001363', '0011304', '0010055']
 3990            Default: []
 3991        - "outputOptions" (dict):
 3992            Output options (see Exomiser docs).
 3993            Default:
 3994                "output_options" =
 3995                    {
 3996                        "outputContributingVariantsOnly": False,
 3997                        "numGenes": 0,
 3998                        "outputFormats": ["TSV_VARIANT", "VCF"]
 3999                    }
 4000        - "transcript_source" (string):
 4001            Transcript source (either "refseq", "ucsc", "ensembl")
 4002            Default: "refseq"
 4003        - "exomiser_to_info" (boolean):
 4004            Add exomiser TSV file columns as INFO fields in VCF.
 4005            Default: False
 4006        - "release" (string):
 4007            Exomiser database release.
 4008            If not exists, database release will be downloaded (take a while).
 4009            Default: None (provided by application.properties configuration file)
 4010        - "exomiser_application_properties" (file):
 4011            Exomiser configuration file (see Exomiser docs).
 4012            Useful to automatically download databases (especially for specific genome databases).
 4013
 4014        Notes:
 4015        - If no sample in parameters, first sample in VCF will be chosen
 4016        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
 4017
 4018        :param threads: The number of threads to use
 4019        :return: None.
 4020        """
 4021
 4022        # DEBUG
 4023        log.debug("Start annotation with Exomiser databases")
 4024
 4025        # Threads
 4026        if not threads:
 4027            threads = self.get_threads()
 4028        log.debug("Threads: " + str(threads))
 4029
 4030        # Config
 4031        config = self.get_config()
 4032        log.debug("Config: " + str(config))
 4033
 4034        # Config - Folders - Databases
 4035        databases_folders = (
 4036            config.get("folders", {})
 4037            .get("databases", {})
 4038            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4039        )
 4040        databases_folders = full_path(databases_folders)
 4041        if not os.path.exists(databases_folders):
 4042            log.error(f"Databases annotations: {databases_folders} NOT found")
 4043        log.debug("Databases annotations: " + str(databases_folders))
 4044
 4045        # Config - Exomiser
 4046        exomiser_bin_command = get_bin_command(
 4047            bin="exomiser-cli*.jar",
 4048            tool="exomiser",
 4049            bin_type="jar",
 4050            config=config,
 4051            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4052        )
 4053        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4054        if not exomiser_bin_command:
 4055            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4056            log.error(msg_err)
 4057            raise ValueError(msg_err)
 4058
 4059        # Param
 4060        param = self.get_param()
 4061        log.debug("Param: " + str(param))
 4062
 4063        # Param - Exomiser
 4064        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4065        log.debug(f"Param Exomiser: {param_exomiser}")
 4066
 4067        # Param - Assembly
 4068        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4069        log.debug("Assembly: " + str(assembly))
 4070
 4071        # Data
 4072        table_variants = self.get_table_variants()
 4073
 4074        # Check if not empty
 4075        log.debug("Check if not empty")
 4076        sql_query_chromosomes = (
 4077            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4078        )
 4079        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4080            log.info(f"VCF empty")
 4081            return False
 4082
 4083        # VCF header
 4084        vcf_reader = self.get_header()
 4085        log.debug("Initial header: " + str(vcf_reader.infos))
 4086
 4087        # Samples
 4088        samples = self.get_header_sample_list()
 4089        if not samples:
 4090            log.error("No Samples in VCF")
 4091            return False
 4092        log.debug(f"Samples: {samples}")
 4093
 4094        # Memory limit
 4095        memory_limit = self.get_memory("8G")
 4096        log.debug(f"memory_limit: {memory_limit}")
 4097
 4098        # Exomiser java options
 4099        exomiser_java_options = (
 4100            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4101        )
 4102        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4103
 4104        # Download Exomiser (if not exists)
 4105        exomiser_release = param_exomiser.get("release", None)
 4106        exomiser_application_properties = param_exomiser.get(
 4107            "exomiser_application_properties", None
 4108        )
 4109        databases_download_exomiser(
 4110            assemblies=[assembly],
 4111            exomiser_folder=databases_folders,
 4112            exomiser_release=exomiser_release,
 4113            exomiser_phenotype_release=exomiser_release,
 4114            exomiser_application_properties=exomiser_application_properties,
 4115        )
 4116
 4117        # Force annotation
 4118        force_update_annotation = True
 4119
 4120        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4121            log.debug("Start annotation Exomiser")
 4122
 4123            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4124
 4125                # tmp_dir = "/tmp/exomiser"
 4126
 4127                ### ANALYSIS ###
 4128                ################
 4129
 4130                # Create analysis.json through analysis dict
 4131                # either analysis in param or by default
 4132                # (depending on preset exome/genome)
 4133
 4134                # Init analysis dict
 4135                param_exomiser_analysis_dict = {}
 4136
 4137                # analysis from param
 4138                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4139                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4140
 4141                # If analysis in param -> load analysis json
 4142                if param_exomiser_analysis:
 4143
 4144                    # If param analysis is a file and exists
 4145                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4146                        param_exomiser_analysis
 4147                    ):
 4148                        # Load analysis file into analysis dict (either yaml or json)
 4149                        with open(param_exomiser_analysis) as json_file:
 4150                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4151
 4152                    # If param analysis is a dict
 4153                    elif isinstance(param_exomiser_analysis, dict):
 4154                        # Load analysis dict into analysis dict (either yaml or json)
 4155                        param_exomiser_analysis_dict = param_exomiser_analysis
 4156
 4157                    # Error analysis type
 4158                    else:
 4159                        log.error(f"Analysis type unknown. Check param file.")
 4160                        raise ValueError(f"Analysis type unknown. Check param file.")
 4161
 4162                # Case no input analysis config file/dict
 4163                # Use preset (exome/genome) to open default config file
 4164                if not param_exomiser_analysis_dict:
 4165
 4166                    # default preset
 4167                    default_preset = "exome"
 4168
 4169                    # Get param preset or default preset
 4170                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4171
 4172                    # Try to find if preset is a file
 4173                    if os.path.exists(param_exomiser_preset):
 4174                        # Preset file is provided in full path
 4175                        param_exomiser_analysis_default_config_file = (
 4176                            param_exomiser_preset
 4177                        )
 4178                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4179                    #     # Preset file is provided in full path
 4180                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4181                    elif os.path.exists(
 4182                        os.path.join(folder_config, param_exomiser_preset)
 4183                    ):
 4184                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
 4185                        param_exomiser_analysis_default_config_file = os.path.join(
 4186                            folder_config, param_exomiser_preset
 4187                        )
 4188                    else:
 4189                        # Construct preset file
 4190                        param_exomiser_analysis_default_config_file = os.path.join(
 4191                            folder_config,
 4192                            f"preset-{param_exomiser_preset}-analysis.json",
 4193                        )
 4194
 4195                    # If preset file exists
 4196                    param_exomiser_analysis_default_config_file = full_path(
 4197                        param_exomiser_analysis_default_config_file
 4198                    )
 4199                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4200                        # Load preset file into analysis dict (either yaml or json)
 4201                        with open(
 4202                            param_exomiser_analysis_default_config_file
 4203                        ) as json_file:
 4204                            # param_exomiser_analysis_dict[""] = json.load(json_file)
 4205                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4206                                json_file
 4207                            )
 4208
 4209                    # Error preset file
 4210                    else:
 4211                        log.error(
 4212                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4213                        )
 4214                        raise ValueError(
 4215                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4216                        )
 4217
 4218                # If no analysis dict created
 4219                if not param_exomiser_analysis_dict:
 4220                    log.error(f"No analysis config")
 4221                    raise ValueError(f"No analysis config")
 4222
 4223                # Log
 4224                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4225
 4226                ### PHENOPACKET ###
 4227                ###################
 4228
 4229                # If no PhenoPacket in analysis dict -> check in param
 4230                if "phenopacket" not in param_exomiser_analysis_dict:
 4231
 4232                    # If PhenoPacket in param -> load phenopacket json
 4233                    if param_exomiser.get("phenopacket", None):
 4234
 4235                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4236                        param_exomiser_phenopacket = full_path(
 4237                            param_exomiser_phenopacket
 4238                        )
 4239
 4240                        # If param phenopacket is a file and exists
 4241                        if isinstance(
 4242                            param_exomiser_phenopacket, str
 4243                        ) and os.path.exists(param_exomiser_phenopacket):
 4244                            # Load phenopacket file into analysis dict (either yaml or json)
 4245                            with open(param_exomiser_phenopacket) as json_file:
 4246                                param_exomiser_analysis_dict["phenopacket"] = (
 4247                                    yaml.safe_load(json_file)
 4248                                )
 4249
 4250                        # If param phenopacket is a dict
 4251                        elif isinstance(param_exomiser_phenopacket, dict):
 4252                            # Load phenopacket dict into analysis dict (either yaml or json)
 4253                            param_exomiser_analysis_dict["phenopacket"] = (
 4254                                param_exomiser_phenopacket
 4255                            )
 4256
 4257                        # Error phenopacket type
 4258                        else:
 4259                            log.error(f"Phenopacket type unknown. Check param file.")
 4260                            raise ValueError(
 4261                                f"Phenopacket type unknown. Check param file."
 4262                            )
 4263
 4264                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4265                if "phenopacket" not in param_exomiser_analysis_dict:
 4266
 4267                    # Init PhenoPacket
 4268                    param_exomiser_analysis_dict["phenopacket"] = {
 4269                        "id": "analysis",
 4270                        "proband": {},
 4271                    }
 4272
 4273                    ### Add subject ###
 4274
 4275                    # If subject exists
 4276                    param_exomiser_subject = param_exomiser.get("subject", {})
 4277
 4278                    # If subject not exists -> found sample ID
 4279                    if not param_exomiser_subject:
 4280
 4281                        # Found sample ID in param
 4282                        sample = param_exomiser.get("sample", None)
 4283
 4284                        # Find sample ID (first sample)
 4285                        if not sample:
 4286                            sample_list = self.get_header_sample_list()
 4287                            if len(sample_list) > 0:
 4288                                sample = sample_list[0]
 4289                            else:
 4290                                log.error(f"No sample found")
 4291                                raise ValueError(f"No sample found")
 4292
 4293                        # Create subject
 4294                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4295
 4296                    # Add to dict
 4297                    param_exomiser_analysis_dict["phenopacket"][
 4298                        "subject"
 4299                    ] = param_exomiser_subject
 4300
 4301                    ### Add "phenotypicFeatures" ###
 4302
 4303                    # If phenotypicFeatures exists
 4304                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4305                        "phenotypicFeatures", []
 4306                    )
 4307
 4308                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4309                    if not param_exomiser_phenotypicfeatures:
 4310
 4311                        # Found HPO in param
 4312                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4313
 4314                        # Split HPO if list in string format separated by comma
 4315                        if isinstance(param_exomiser_hpo, str):
 4316                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4317
 4318                        # Create HPO list
 4319                        for hpo in param_exomiser_hpo:
 4320                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4321                            param_exomiser_phenotypicfeatures.append(
 4322                                {
 4323                                    "type": {
 4324                                        "id": f"HP:{hpo_clean}",
 4325                                        "label": f"HP:{hpo_clean}",
 4326                                    }
 4327                                }
 4328                            )
 4329
 4330                    # Add to dict
 4331                    param_exomiser_analysis_dict["phenopacket"][
 4332                        "phenotypicFeatures"
 4333                    ] = param_exomiser_phenotypicfeatures
 4334
 4335                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4336                    if not param_exomiser_phenotypicfeatures:
 4337                        for step in param_exomiser_analysis_dict.get(
 4338                            "analysis", {}
 4339                        ).get("steps", []):
 4340                            if "hiPhivePrioritiser" in step:
 4341                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4342                                    "steps", []
 4343                                ).remove(step)
 4344
 4345                ### Add Input File ###
 4346
 4347                # Initial file name and htsFiles
 4348                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4349                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4350                    {
 4351                        "uri": tmp_vcf_name,
 4352                        "htsFormat": "VCF",
 4353                        "genomeAssembly": assembly,
 4354                    }
 4355                ]
 4356
 4357                ### Add metaData ###
 4358
 4359                # If metaData not in analysis dict
 4360                if "metaData" not in param_exomiser_analysis_dict:
 4361                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4362                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4363                        "createdBy": "howard",
 4364                        "phenopacketSchemaVersion": 1,
 4365                    }
 4366
 4367                ### OutputOptions ###
 4368
 4369                # Init output result folder
 4370                output_results = os.path.join(tmp_dir, "results")
 4371
 4372                # If no outputOptions in analysis dict
 4373                if "outputOptions" not in param_exomiser_analysis_dict:
 4374
 4375                    # default output formats
 4376                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4377
 4378                    # Get outputOptions in param
 4379                    output_options = param_exomiser.get("outputOptions", None)
 4380
 4381                    # If no output_options in param -> check
 4382                    if not output_options:
 4383                        output_options = {
 4384                            "outputContributingVariantsOnly": False,
 4385                            "numGenes": 0,
 4386                            "outputFormats": defaut_output_formats,
 4387                        }
 4388
 4389                    # Replace outputDirectory in output options
 4390                    output_options["outputDirectory"] = output_results
 4391                    output_options["outputFileName"] = "howard"
 4392
 4393                    # Add outputOptions in analysis dict
 4394                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4395
 4396                else:
 4397
 4398                    # Replace output_results and output format (if exists in param)
 4399                    param_exomiser_analysis_dict["outputOptions"][
 4400                        "outputDirectory"
 4401                    ] = output_results
 4402                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4403                        list(
 4404                            set(
 4405                                param_exomiser_analysis_dict.get(
 4406                                    "outputOptions", {}
 4407                                ).get("outputFormats", [])
 4408                                + ["TSV_VARIANT", "VCF"]
 4409                            )
 4410                        )
 4411                    )
 4412
 4413                # log
 4414                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4415
 4416                ### ANALYSIS FILE ###
 4417                #####################
 4418
 4419                ### Full JSON analysis config file ###
 4420
 4421                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4422                with open(exomiser_analysis, "w") as fp:
 4423                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4424
 4425                ### SPLIT analysis and sample config files
 4426
 4427                # Splitted analysis dict
 4428                param_exomiser_analysis_dict_for_split = (
 4429                    param_exomiser_analysis_dict.copy()
 4430                )
 4431
 4432                # Phenopacket JSON file
 4433                exomiser_analysis_phenopacket = os.path.join(
 4434                    tmp_dir, "analysis_phenopacket.json"
 4435                )
 4436                with open(exomiser_analysis_phenopacket, "w") as fp:
 4437                    json.dump(
 4438                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4439                        fp,
 4440                        indent=4,
 4441                    )
 4442
 4443                # Analysis JSON file without Phenopacket parameters
 4444                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4445                exomiser_analysis_analysis = os.path.join(
 4446                    tmp_dir, "analysis_analysis.json"
 4447                )
 4448                with open(exomiser_analysis_analysis, "w") as fp:
 4449                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4450
                ### INITIAL VCF file ###
 4452                #######################
 4453
                ### Create list of samples to use and include into the initial VCF file ####
 4455
 4456                # Subject (main sample)
 4457                # Get sample ID in analysis dict
 4458                sample_subject = (
 4459                    param_exomiser_analysis_dict.get("phenopacket", {})
 4460                    .get("subject", {})
 4461                    .get("id", None)
 4462                )
 4463                sample_proband = (
 4464                    param_exomiser_analysis_dict.get("phenopacket", {})
 4465                    .get("proband", {})
 4466                    .get("subject", {})
 4467                    .get("id", None)
 4468                )
 4469                sample = []
 4470                if sample_subject:
 4471                    sample.append(sample_subject)
 4472                if sample_proband:
 4473                    sample.append(sample_proband)
 4474
 4475                # Get sample ID within Pedigree
 4476                pedigree_persons_list = (
 4477                    param_exomiser_analysis_dict.get("phenopacket", {})
 4478                    .get("pedigree", {})
 4479                    .get("persons", {})
 4480                )
 4481
 4482                # Create list with all sample ID in pedigree (if exists)
 4483                pedigree_persons = []
 4484                for person in pedigree_persons_list:
 4485                    pedigree_persons.append(person.get("individualId"))
 4486
                # Concat subject sample ID and samples ID in pedigree samples
 4488                samples = list(set(sample + pedigree_persons))
 4489
 4490                # Check if sample list is not empty
 4491                if not samples:
 4492                    log.error(f"No samples found")
 4493                    raise ValueError(f"No samples found")
 4494
 4495                # Create VCF with sample (either sample in param or first one by default)
 4496                # Export VCF file
 4497                self.export_variant_vcf(
 4498                    vcf_file=tmp_vcf_name,
 4499                    remove_info=True,
 4500                    add_samples=True,
 4501                    list_samples=samples,
 4502                    index=False,
 4503                )
 4504
 4505                ### Execute Exomiser ###
 4506                ########################
 4507
 4508                # Init command
 4509                exomiser_command = ""
 4510
 4511                # Command exomiser options
 4512                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 4513
 4514                # Release
 4515                exomiser_release = param_exomiser.get("release", None)
 4516                if exomiser_release:
 4517                    # phenotype data version
 4518                    exomiser_options += (
 4519                        f" --exomiser.phenotype.data-version={exomiser_release} "
 4520                    )
 4521                    # data version
 4522                    exomiser_options += (
 4523                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 4524                    )
 4525                    # variant white list
 4526                    variant_white_list_file = (
 4527                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 4528                    )
 4529                    if os.path.exists(
 4530                        os.path.join(
 4531                            databases_folders, assembly, variant_white_list_file
 4532                        )
 4533                    ):
 4534                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 4535
 4536                # transcript_source
 4537                transcript_source = param_exomiser.get(
 4538                    "transcript_source", None
 4539                )  # ucsc, refseq, ensembl
 4540                if transcript_source:
 4541                    exomiser_options += (
 4542                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 4543                    )
 4544
 4545                # If analysis contain proband param
 4546                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 4547                    "proband", {}
 4548                ):
 4549                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 4550
 4551                # If no proband (usually uniq sample)
 4552                else:
 4553                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 4554
 4555                # Log
 4556                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 4557
 4558                # Run command
 4559                result = subprocess.call(
 4560                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 4561                )
 4562                if result:
 4563                    log.error("Exomiser command failed")
 4564                    raise ValueError("Exomiser command failed")
 4565
 4566                ### RESULTS ###
 4567                ###############
 4568
 4569                ### Annotate with TSV fields ###
 4570
 4571                # Init result tsv file
 4572                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 4573
 4574                # Init result tsv file
 4575                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 4576
 4577                # Parse TSV file and explode columns in INFO field
 4578                if exomiser_to_info and os.path.exists(output_results_tsv):
 4579
 4580                    # Log
 4581                    log.debug("Exomiser columns to VCF INFO field")
 4582
 4583                    # Retrieve columns and types
 4584                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 4585                    output_results_tsv_df = self.get_query_to_df(query)
 4586                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 4587
 4588                    # Init concat fields for update
 4589                    sql_query_update_concat_fields = []
 4590
 4591                    # Fields to avoid
 4592                    fields_to_avoid = [
 4593                        "CONTIG",
 4594                        "START",
 4595                        "END",
 4596                        "REF",
 4597                        "ALT",
 4598                        "QUAL",
 4599                        "FILTER",
 4600                        "GENOTYPE",
 4601                    ]
 4602
 4603                    # List all columns to add into header
 4604                    for header_column in output_results_tsv_columns:
 4605
 4606                        # If header column is enable
 4607                        if header_column not in fields_to_avoid:
 4608
 4609                            # Header info type
 4610                            header_info_type = "String"
 4611                            header_column_df = output_results_tsv_df[header_column]
 4612                            header_column_df_dtype = header_column_df.dtype
 4613                            if header_column_df_dtype == object:
 4614                                if (
 4615                                    pd.to_numeric(header_column_df, errors="coerce")
 4616                                    .notnull()
 4617                                    .all()
 4618                                ):
 4619                                    header_info_type = "Float"
 4620                            else:
 4621                                header_info_type = "Integer"
 4622
 4623                            # Header info
 4624                            characters_to_validate = ["-"]
 4625                            pattern = "[" + "".join(characters_to_validate) + "]"
 4626                            header_info_name = re.sub(
 4627                                pattern,
 4628                                "_",
 4629                                f"Exomiser_{header_column}".replace("#", ""),
 4630                            )
 4631                            header_info_number = "."
 4632                            header_info_description = (
 4633                                f"Exomiser {header_column} annotation"
 4634                            )
 4635                            header_info_source = "Exomiser"
 4636                            header_info_version = "unknown"
 4637                            header_info_code = CODE_TYPE_MAP[header_info_type]
 4638                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 4639                                header_info_name,
 4640                                header_info_number,
 4641                                header_info_type,
 4642                                header_info_description,
 4643                                header_info_source,
 4644                                header_info_version,
 4645                                header_info_code,
 4646                            )
 4647
 4648                            # Add field to add for update to concat fields
 4649                            sql_query_update_concat_fields.append(
 4650                                f"""
 4651                                CASE
 4652                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 4653                                    THEN concat(
 4654                                        '{header_info_name}=',
 4655                                        table_parquet."{header_column}",
 4656                                        ';'
 4657                                        )
 4658
 4659                                    ELSE ''
 4660                                END
 4661                            """
 4662                            )
 4663
 4664                    # Update query
 4665                    sql_query_update = f"""
 4666                        UPDATE {table_variants} as table_variants
 4667                            SET INFO = concat(
 4668                                            CASE
 4669                                                WHEN INFO NOT IN ('', '.')
 4670                                                THEN INFO
 4671                                                ELSE ''
 4672                                            END,
 4673                                            CASE
 4674                                                WHEN table_variants.INFO NOT IN ('','.')
 4675                                                THEN ';'
 4676                                                ELSE ''
 4677                                            END,
 4678                                            (
 4679                                            SELECT 
 4680                                                concat(
 4681                                                    {",".join(sql_query_update_concat_fields)}
 4682                                                )
 4683                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 4684                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 4685                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 4686                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 4687                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 4688                                            )
 4689                                        )
 4690                            ;
 4691                        """
 4692
 4693                    # Update
 4694                    self.conn.execute(sql_query_update)
 4695
 4696                ### Annotate with VCF INFO field ###
 4697
 4698                # Init result VCF file
 4699                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 4700
 4701                # If VCF exists
 4702                if os.path.exists(output_results_vcf):
 4703
 4704                    # Log
 4705                    log.debug("Exomiser result VCF update variants")
 4706
 4707                    # Find Exomiser INFO field annotation in header
 4708                    with gzip.open(output_results_vcf, "rt") as f:
 4709                        header_list = self.read_vcf_header(f)
 4710                    exomiser_vcf_header = vcf.Reader(
 4711                        io.StringIO("\n".join(header_list))
 4712                    )
 4713
 4714                    # Add annotation INFO field to header
 4715                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 4716
 4717                    # Update variants with VCF
 4718                    self.update_from_vcf(output_results_vcf)
 4719
 4720        return True
 4721
 4722    def annotation_snpeff(self, threads: int = None) -> None:
 4723        """
 4724        This function annotate with snpEff
 4725
 4726        :param threads: The number of threads to use
 4727        :return: the value of the variable "return_value".
 4728        """
 4729
 4730        # DEBUG
 4731        log.debug("Start annotation with snpeff databases")
 4732
 4733        # Threads
 4734        if not threads:
 4735            threads = self.get_threads()
 4736        log.debug("Threads: " + str(threads))
 4737
 4738        # DEBUG
 4739        delete_tmp = True
 4740        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4741            delete_tmp = False
 4742            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4743
 4744        # Config
 4745        config = self.get_config()
 4746        log.debug("Config: " + str(config))
 4747
 4748        # Config - Folders - Databases
 4749        databases_folders = (
 4750            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 4751        )
 4752        log.debug("Databases annotations: " + str(databases_folders))
 4753
 4754        # # Config - Java
 4755        # java_bin = get_bin(
 4756        #     tool="java",
 4757        #     bin="java",
 4758        #     bin_type="bin",
 4759        #     config=config,
 4760        #     default_folder="/usr/bin",
 4761        # )
 4762        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
 4763        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
 4764        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
 4765
 4766        # # Config - snpEff bin
 4767        # snpeff_jar = get_bin(
 4768        #     tool="snpeff",
 4769        #     bin="snpEff.jar",
 4770        #     bin_type="jar",
 4771        #     config=config,
 4772        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4773        # )
 4774        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
 4775        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4776        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4777
 4778        # Config - snpEff bin command
 4779        snpeff_bin_command = get_bin_command(
 4780            bin="snpEff.jar",
 4781            tool="snpeff",
 4782            bin_type="jar",
 4783            config=config,
 4784            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4785        )
 4786        if not snpeff_bin_command:
 4787            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 4788            log.error(msg_err)
 4789            raise ValueError(msg_err)
 4790
 4791        # Config - snpEff databases
 4792        snpeff_databases = (
 4793            config.get("folders", {})
 4794            .get("databases", {})
 4795            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 4796        )
 4797        snpeff_databases = full_path(snpeff_databases)
 4798        if snpeff_databases is not None and snpeff_databases != "":
 4799            log.debug(f"Create snpEff databases folder")
 4800            if not os.path.exists(snpeff_databases):
 4801                os.makedirs(snpeff_databases)
 4802
 4803        # Param
 4804        param = self.get_param()
 4805        log.debug("Param: " + str(param))
 4806
 4807        # Param
 4808        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 4809        log.debug("Options: " + str(options))
 4810
 4811        # Param - Assembly
 4812        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4813
 4814        # Param - Options
 4815        snpeff_options = (
 4816            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 4817        )
 4818        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 4819        snpeff_csvstats = (
 4820            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 4821        )
 4822        if snpeff_stats:
 4823            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 4824            snpeff_stats = full_path(snpeff_stats)
 4825            snpeff_options += f" -stats {snpeff_stats}"
 4826        if snpeff_csvstats:
 4827            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 4828            snpeff_csvstats = full_path(snpeff_csvstats)
 4829            snpeff_options += f" -csvStats {snpeff_csvstats}"
 4830
 4831        # Data
 4832        table_variants = self.get_table_variants()
 4833
 4834        # Check if not empty
 4835        log.debug("Check if not empty")
 4836        sql_query_chromosomes = (
 4837            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4838        )
 4839        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 4840        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4841            log.info(f"VCF empty")
 4842            return
 4843
 4844        # Export in VCF
 4845        log.debug("Create initial file to annotate")
 4846        tmp_vcf = NamedTemporaryFile(
 4847            prefix=self.get_prefix(),
 4848            dir=self.get_tmp_dir(),
 4849            suffix=".vcf.gz",
 4850            delete=True,
 4851        )
 4852        tmp_vcf_name = tmp_vcf.name
 4853
 4854        # VCF header
 4855        vcf_reader = self.get_header()
 4856        log.debug("Initial header: " + str(vcf_reader.infos))
 4857
 4858        # Existing annotations
 4859        for vcf_annotation in self.get_header().infos:
 4860
 4861            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4862            log.debug(
 4863                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4864            )
 4865
 4866        # Memory limit
 4867        # if config.get("memory", None):
 4868        #     memory_limit = config.get("memory", "8G")
 4869        # else:
 4870        #     memory_limit = "8G"
 4871        memory_limit = self.get_memory("8G")
 4872        log.debug(f"memory_limit: {memory_limit}")
 4873
 4874        # snpEff java options
 4875        snpeff_java_options = (
 4876            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4877        )
 4878        log.debug(f"Exomiser java options: {snpeff_java_options}")
 4879
 4880        force_update_annotation = True
 4881
 4882        if "ANN" not in self.get_header().infos or force_update_annotation:
 4883
 4884            # Check snpEff database
 4885            log.debug(f"Check snpEff databases {[assembly]}")
 4886            databases_download_snpeff(
 4887                folder=snpeff_databases, assemblies=[assembly], config=config
 4888            )
 4889
 4890            # Export VCF file
 4891            self.export_variant_vcf(
 4892                vcf_file=tmp_vcf_name,
 4893                remove_info=True,
 4894                add_samples=False,
 4895                index=True,
 4896            )
 4897
 4898            # Tmp file
 4899            err_files = []
 4900            tmp_annotate_vcf = NamedTemporaryFile(
 4901                prefix=self.get_prefix(),
 4902                dir=self.get_tmp_dir(),
 4903                suffix=".vcf",
 4904                delete=False,
 4905            )
 4906            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4907            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4908            err_files.append(tmp_annotate_vcf_name_err)
 4909
 4910            # Command
 4911            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 4912            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 4913            run_parallel_commands([snpeff_command], 1)
 4914
 4915            # Error messages
 4916            log.info(f"Error/Warning messages:")
 4917            error_message_command_all = []
 4918            error_message_command_warning = []
 4919            error_message_command_err = []
 4920            for err_file in err_files:
 4921                with open(err_file, "r") as f:
 4922                    for line in f:
 4923                        message = line.strip()
 4924                        error_message_command_all.append(message)
 4925                        if line.startswith("[W::"):
 4926                            error_message_command_warning.append(message)
 4927                        if line.startswith("[E::"):
 4928                            error_message_command_err.append(f"{err_file}: " + message)
 4929            # log info
 4930            for message in list(
 4931                set(error_message_command_err + error_message_command_warning)
 4932            ):
 4933                log.info(f"   {message}")
 4934            # debug info
 4935            for message in list(set(error_message_command_all)):
 4936                log.debug(f"   {message}")
 4937            # failed
 4938            if len(error_message_command_err):
 4939                log.error("Annotation failed: Error in commands")
 4940                raise ValueError("Annotation failed: Error in commands")
 4941
 4942            # Find annotation in header
 4943            with open(tmp_annotate_vcf_name, "rt") as f:
 4944                header_list = self.read_vcf_header(f)
 4945            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 4946
 4947            for ann in annovar_vcf_header.infos:
 4948                if ann not in self.get_header().infos:
 4949                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 4950
 4951            # Update variants
 4952            log.info(f"Annotation - Updating...")
 4953            self.update_from_vcf(tmp_annotate_vcf_name)
 4954
 4955        else:
 4956            if "ANN" in self.get_header().infos:
 4957                log.debug(f"Existing snpEff annotations in VCF")
 4958            if force_update_annotation:
 4959                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 4960
 4961    def annotation_annovar(self, threads: int = None) -> None:
 4962        """
 4963        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
 4964        annotations
 4965
 4966        :param threads: number of threads to use
 4967        :return: the value of the variable "return_value".
 4968        """
 4969
 4970        # DEBUG
 4971        log.debug("Start annotation with Annovar databases")
 4972
 4973        # Threads
 4974        if not threads:
 4975            threads = self.get_threads()
 4976        log.debug("Threads: " + str(threads))
 4977
 4978        # Tmp en Err files
 4979        tmp_files = []
 4980        err_files = []
 4981
 4982        # DEBUG
 4983        delete_tmp = True
 4984        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4985            delete_tmp = False
 4986            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4987
 4988        # Config
 4989        config = self.get_config()
 4990        log.debug("Config: " + str(config))
 4991
 4992        # Config - Folders - Databases
 4993        databases_folders = (
 4994            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
 4995        )
 4996        log.debug("Databases annotations: " + str(databases_folders))
 4997
 4998        # Config - annovar bin command
 4999        annovar_bin_command = get_bin_command(
 5000            bin="table_annovar.pl",
 5001            tool="annovar",
 5002            bin_type="perl",
 5003            config=config,
 5004            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
 5005        )
 5006        if not annovar_bin_command:
 5007            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
 5008            log.error(msg_err)
 5009            raise ValueError(msg_err)
 5010
 5011        # Config - BCFTools bin command
 5012        bcftools_bin_command = get_bin_command(
 5013            bin="bcftools",
 5014            tool="bcftools",
 5015            bin_type="bin",
 5016            config=config,
 5017            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 5018        )
 5019        if not bcftools_bin_command:
 5020            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 5021            log.error(msg_err)
 5022            raise ValueError(msg_err)
 5023
 5024        # Config - annovar databases
 5025        annovar_databases = (
 5026            config.get("folders", {})
 5027            .get("databases", {})
 5028            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
 5029        )
 5030        annovar_databases = full_path(annovar_databases)
 5031        if annovar_databases != "" and not os.path.exists(annovar_databases):
 5032            os.makedirs(annovar_databases)
 5033
 5034        # Param
 5035        param = self.get_param()
 5036        log.debug("Param: " + str(param))
 5037
 5038        # Param - options
 5039        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
 5040        log.debug("Options: " + str(options))
 5041
 5042        # Param - annotations
 5043        annotations = (
 5044            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
 5045        )
 5046        log.debug("Annotations: " + str(annotations))
 5047
 5048        # Param - Assembly
 5049        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5050
 5051        # Annovar database assembly
 5052        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
 5053        if annovar_databases_assembly != "" and not os.path.exists(
 5054            annovar_databases_assembly
 5055        ):
 5056            os.makedirs(annovar_databases_assembly)
 5057
 5058        # Data
 5059        table_variants = self.get_table_variants()
 5060
 5061        # Check if not empty
 5062        log.debug("Check if not empty")
 5063        sql_query_chromosomes = (
 5064            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5065        )
 5066        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 5067        if not sql_query_chromosomes_df["count"][0]:
 5068            log.info(f"VCF empty")
 5069            return
 5070
 5071        # VCF header
 5072        vcf_reader = self.get_header()
 5073        log.debug("Initial header: " + str(vcf_reader.infos))
 5074
 5075        # Existing annotations
 5076        for vcf_annotation in self.get_header().infos:
 5077
 5078            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5079            log.debug(
 5080                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5081            )
 5082
 5083        force_update_annotation = True
 5084
 5085        if annotations:
 5086
 5087            commands = []
 5088            tmp_annotates_vcf_name_list = []
 5089
 5090            # Export in VCF
 5091            log.debug("Create initial file to annotate")
 5092            tmp_vcf = NamedTemporaryFile(
 5093                prefix=self.get_prefix(),
 5094                dir=self.get_tmp_dir(),
 5095                suffix=".vcf.gz",
 5096                delete=False,
 5097            )
 5098            tmp_vcf_name = tmp_vcf.name
 5099            tmp_files.append(tmp_vcf_name)
 5100            tmp_files.append(tmp_vcf_name + ".tbi")
 5101
 5102            # Export VCF file
 5103            self.export_variant_vcf(
 5104                vcf_file=tmp_vcf_name,
 5105                remove_info=".",
 5106                add_samples=False,
 5107                index=True,
 5108            )
 5109
 5110            # Create file for field rename
 5111            log.debug("Create file for field rename")
 5112            tmp_rename = NamedTemporaryFile(
 5113                prefix=self.get_prefix(),
 5114                dir=self.get_tmp_dir(),
 5115                suffix=".rename",
 5116                delete=False,
 5117            )
 5118            tmp_rename_name = tmp_rename.name
 5119            tmp_files.append(tmp_rename_name)
 5120
 5121            # Check Annovar database
 5122            log.debug(
 5123                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
 5124            )
 5125            databases_download_annovar(
 5126                folder=annovar_databases,
 5127                files=list(annotations.keys()),
 5128                assemblies=[assembly],
 5129            )
 5130
 5131            for annotation in annotations:
 5132                annotation_fields = annotations[annotation]
 5133
 5134                if not annotation_fields:
 5135                    annotation_fields = {"INFO": None}
 5136
 5137                log.info(f"Annotations Annovar - database '{annotation}'")
 5138                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
 5139
 5140                # Tmp file for annovar
 5141                err_files = []
 5142                tmp_annotate_vcf_directory = TemporaryDirectory(
 5143                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
 5144                )
 5145                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
 5146                tmp_annotate_vcf_name_annovar = (
 5147                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
 5148                )
 5149                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
 5150                err_files.append(tmp_annotate_vcf_name_err)
 5151                tmp_files.append(tmp_annotate_vcf_name_err)
 5152
 5153                # Tmp file final vcf annotated by annovar
 5154                tmp_annotate_vcf = NamedTemporaryFile(
 5155                    prefix=self.get_prefix(),
 5156                    dir=self.get_tmp_dir(),
 5157                    suffix=".vcf.gz",
 5158                    delete=False,
 5159                )
 5160                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5161                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
 5162                tmp_files.append(tmp_annotate_vcf_name)
 5163                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
 5164
 5165                # Number of fields
 5166                annotation_list = []
 5167                annotation_renamed_list = []
 5168
 5169                for annotation_field in annotation_fields:
 5170
 5171                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 5172                    annotation_fields_new_name = annotation_fields.get(
 5173                        annotation_field, annotation_field
 5174                    )
 5175                    if not annotation_fields_new_name:
 5176                        annotation_fields_new_name = annotation_field
 5177
 5178                    if (
 5179                        force_update_annotation
 5180                        or annotation_fields_new_name not in self.get_header().infos
 5181                    ):
 5182                        annotation_list.append(annotation_field)
 5183                        annotation_renamed_list.append(annotation_fields_new_name)
 5184                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
 5185                        log.warning(
 5186                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 5187                        )
 5188
 5189                    # Add rename info
 5190                    run_parallel_commands(
 5191                        [
 5192                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
 5193                        ],
 5194                        1,
 5195                    )
 5196
 5197                # log.debug("fields_to_removed: " + str(fields_to_removed))
 5198                log.debug("annotation_list: " + str(annotation_list))
 5199
 5200                # protocol
 5201                protocol = annotation
 5202
 5203                # argument
 5204                argument = ""
 5205
 5206                # operation
 5207                operation = "f"
 5208                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
 5209                    "ensGene"
 5210                ):
 5211                    operation = "g"
 5212                    if options.get("genebase", None):
 5213                        argument = f"""'{options.get("genebase","")}'"""
 5214                elif annotation in ["cytoBand"]:
 5215                    operation = "r"
 5216
 5217                # argument option
 5218                argument_option = ""
 5219                if argument != "":
 5220                    argument_option = " --argument " + argument
 5221
 5222                # command options
 5223                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
 5224                for option in options:
 5225                    if option not in ["genebase"]:
 5226                        command_options += f""" --{option}={options[option]}"""
 5227
 5228                # Command
 5229
 5230                # Command - Annovar
 5231                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
 5232                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
 5233
 5234                # Command - start pipe
 5235                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
 5236
 5237                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
 5238                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
 5239
 5240                # Command - Special characters (refGene annotation)
 5241                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
 5242
 5243                # Command - Clean empty fields (with value ".")
 5244                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
 5245
 5246                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
 5247                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
 5248                if "ALL" not in annotation_list and "INFO" not in annotation_list:
 5249                    # for ann in annotation_renamed_list:
 5250                    for ann in annotation_list:
 5251                        annovar_fields_to_keep.append(f"^INFO/{ann}")
 5252
 5253                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
 5254
 5255                # Command - indexing
 5256                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
 5257
 5258                log.debug(f"Annotation - Annovar command: {command_annovar}")
 5259                run_parallel_commands([command_annovar], 1)
 5260
 5261                # Error messages
 5262                log.info(f"Error/Warning messages:")
 5263                error_message_command_all = []
 5264                error_message_command_warning = []
 5265                error_message_command_err = []
 5266                for err_file in err_files:
 5267                    with open(err_file, "r") as f:
 5268                        for line in f:
 5269                            message = line.strip()
 5270                            error_message_command_all.append(message)
 5271                            if line.startswith("[W::") or line.startswith("WARNING"):
 5272                                error_message_command_warning.append(message)
 5273                            if line.startswith("[E::") or line.startswith("ERROR"):
 5274                                error_message_command_err.append(
 5275                                    f"{err_file}: " + message
 5276                                )
 5277                # log info
 5278                for message in list(
 5279                    set(error_message_command_err + error_message_command_warning)
 5280                ):
 5281                    log.info(f"   {message}")
 5282                # debug info
 5283                for message in list(set(error_message_command_all)):
 5284                    log.debug(f"   {message}")
 5285                # failed
 5286                if len(error_message_command_err):
 5287                    log.error("Annotation failed: Error in commands")
 5288                    raise ValueError("Annotation failed: Error in commands")
 5289
 5290            if tmp_annotates_vcf_name_list:
 5291
 5292                # List of annotated files
 5293                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
 5294
 5295                # Tmp file
 5296                tmp_annotate_vcf = NamedTemporaryFile(
 5297                    prefix=self.get_prefix(),
 5298                    dir=self.get_tmp_dir(),
 5299                    suffix=".vcf.gz",
 5300                    delete=False,
 5301                )
 5302                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5303                tmp_files.append(tmp_annotate_vcf_name)
 5304                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5305                err_files.append(tmp_annotate_vcf_name_err)
 5306                tmp_files.append(tmp_annotate_vcf_name_err)
 5307
 5308                # Command merge
 5309                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
 5310                log.info(
 5311                    f"Annotation Annovar - Annotation merging "
 5312                    + str(len(tmp_annotates_vcf_name_list))
 5313                    + " annotated files"
 5314                )
 5315                log.debug(f"Annotation - merge command: {merge_command}")
 5316                run_parallel_commands([merge_command], 1)
 5317
 5318                # Find annotation in header
 5319                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
 5320                    header_list = self.read_vcf_header(f)
 5321                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5322
 5323                for ann in annovar_vcf_header.infos:
 5324                    if ann not in self.get_header().infos:
 5325                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5326
 5327                # Update variants
 5328                log.info(f"Annotation Annovar - Updating...")
 5329                self.update_from_vcf(tmp_annotate_vcf_name)
 5330
 5331            # Clean files
 5332            # Tmp file remove command
 5333            if True:
 5334                tmp_files_remove_command = ""
 5335                if tmp_files:
 5336                    tmp_files_remove_command = " ".join(tmp_files)
 5337                clean_command = f" rm -f {tmp_files_remove_command} "
 5338                log.debug(f"Annotation Annovar - Annotation cleaning ")
 5339                log.debug(f"Annotation - cleaning command: {clean_command}")
 5340                run_parallel_commands([clean_command], 1)
 5341
    # --- Parquet-based annotation ---------------------------------------
 5343    def annotation_parquet(self, threads: int = None) -> None:
 5344        """
 5345        It takes a VCF file, and annotates it with a parquet file
 5346
 5347        :param threads: number of threads to use for the annotation
 5348        :return: the value of the variable "result".
 5349        """
 5350
 5351        # DEBUG
 5352        log.debug("Start annotation with parquet databases")
 5353
 5354        # Threads
 5355        if not threads:
 5356            threads = self.get_threads()
 5357        log.debug("Threads: " + str(threads))
 5358
 5359        # DEBUG
 5360        delete_tmp = True
 5361        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5362            delete_tmp = False
 5363            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5364
 5365        # Config
 5366        databases_folders = set(
 5367            self.get_config()
 5368            .get("folders", {})
 5369            .get("databases", {})
 5370            .get("annotations", ["."])
 5371            + self.get_config()
 5372            .get("folders", {})
 5373            .get("databases", {})
 5374            .get("parquet", ["."])
 5375        )
 5376        log.debug("Databases annotations: " + str(databases_folders))
 5377
 5378        # Param
 5379        annotations = (
 5380            self.get_param()
 5381            .get("annotation", {})
 5382            .get("parquet", {})
 5383            .get("annotations", None)
 5384        )
 5385        log.debug("Annotations: " + str(annotations))
 5386
 5387        # Assembly
 5388        assembly = self.get_param().get(
 5389            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5390        )
 5391
 5392        # Force Update Annotation
 5393        force_update_annotation = (
 5394            self.get_param()
 5395            .get("annotation", {})
 5396            .get("options", {})
 5397            .get("annotations_update", False)
 5398        )
 5399        log.debug(f"force_update_annotation={force_update_annotation}")
 5400        force_append_annotation = (
 5401            self.get_param()
 5402            .get("annotation", {})
 5403            .get("options", {})
 5404            .get("annotations_append", False)
 5405        )
 5406        log.debug(f"force_append_annotation={force_append_annotation}")
 5407
 5408        # Data
 5409        table_variants = self.get_table_variants()
 5410
 5411        # Check if not empty
 5412        log.debug("Check if not empty")
 5413        sql_query_chromosomes_df = self.get_query_to_df(
 5414            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5415        )
 5416        if not sql_query_chromosomes_df["count"][0]:
 5417            log.info(f"VCF empty")
 5418            return
 5419
 5420        # VCF header
 5421        vcf_reader = self.get_header()
 5422        log.debug("Initial header: " + str(vcf_reader.infos))
 5423
 5424        # Nb Variants POS
 5425        log.debug("NB Variants Start")
 5426        nb_variants = self.conn.execute(
 5427            f"SELECT count(*) AS count FROM variants"
 5428        ).fetchdf()["count"][0]
 5429        log.debug("NB Variants Stop")
 5430
 5431        # Existing annotations
 5432        for vcf_annotation in self.get_header().infos:
 5433
 5434            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5435            log.debug(
 5436                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5437            )
 5438
 5439        # Added columns
 5440        added_columns = []
 5441
 5442        # drop indexes
 5443        log.debug(f"Drop indexes...")
 5444        self.drop_indexes()
 5445
 5446        if annotations:
 5447
 5448            if "ALL" in annotations:
 5449
 5450                all_param = annotations.get("ALL", {})
 5451                all_param_formats = all_param.get("formats", None)
 5452                all_param_releases = all_param.get("releases", None)
 5453
 5454                databases_infos_dict = self.scan_databases(
 5455                    database_formats=all_param_formats,
 5456                    database_releases=all_param_releases,
 5457                )
 5458                for database_infos in databases_infos_dict.keys():
 5459                    if database_infos not in annotations:
 5460                        annotations[database_infos] = {"INFO": None}
 5461
 5462            for annotation in annotations:
 5463
 5464                if annotation in ["ALL"]:
 5465                    continue
 5466
 5467                # Annotation Name
 5468                annotation_name = os.path.basename(annotation)
 5469
 5470                # Annotation fields
 5471                annotation_fields = annotations[annotation]
 5472                if not annotation_fields:
 5473                    annotation_fields = {"INFO": None}
 5474
 5475                log.debug(f"Annotation '{annotation_name}'")
 5476                log.debug(
 5477                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5478                )
 5479
 5480                # Create Database
 5481                database = Database(
 5482                    database=annotation,
 5483                    databases_folders=databases_folders,
 5484                    assembly=assembly,
 5485                )
 5486
 5487                # Find files
 5488                parquet_file = database.get_database()
 5489                parquet_hdr_file = database.get_header_file()
 5490                parquet_type = database.get_type()
 5491
 5492                # Check if files exists
 5493                if not parquet_file or not parquet_hdr_file:
 5494                    log.error("Annotation failed: file not found")
 5495                    raise ValueError("Annotation failed: file not found")
 5496                else:
 5497                    # Get parquet connexion
 5498                    parquet_sql_attach = database.get_sql_database_attach(
 5499                        output="query"
 5500                    )
 5501                    if parquet_sql_attach:
 5502                        self.conn.execute(parquet_sql_attach)
 5503                    parquet_file_link = database.get_sql_database_link()
 5504                    # Log
 5505                    log.debug(
 5506                        f"Annotation '{annotation_name}' - file: "
 5507                        + str(parquet_file)
 5508                        + " and "
 5509                        + str(parquet_hdr_file)
 5510                    )
 5511
 5512                    # Database full header columns
 5513                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 5514                        parquet_hdr_file
 5515                    )
 5516                    # Log
 5517                    log.debug(
 5518                        "Annotation database header columns : "
 5519                        + str(parquet_hdr_vcf_header_columns)
 5520                    )
 5521
 5522                    # Load header as VCF object
 5523                    parquet_hdr_vcf_header_infos = database.get_header().infos
 5524                    # Log
 5525                    log.debug(
 5526                        "Annotation database header: "
 5527                        + str(parquet_hdr_vcf_header_infos)
 5528                    )
 5529
 5530                    # Get extra infos
 5531                    parquet_columns = database.get_extra_columns()
 5532                    # Log
 5533                    log.debug("Annotation database Columns: " + str(parquet_columns))
 5534
 5535                    # Add extra columns if "ALL" in annotation_fields
 5536                    # if "ALL" in annotation_fields:
 5537                    #     allow_add_extra_column = True
 5538                    if "ALL" in annotation_fields and database.get_extra_columns():
 5539                        for extra_column in database.get_extra_columns():
 5540                            if (
 5541                                extra_column not in annotation_fields
 5542                                and extra_column.replace("INFO/", "")
 5543                                not in parquet_hdr_vcf_header_infos
 5544                            ):
 5545                                parquet_hdr_vcf_header_infos[extra_column] = (
 5546                                    vcf.parser._Info(
 5547                                        extra_column,
 5548                                        ".",
 5549                                        "String",
 5550                                        f"{extra_column} description",
 5551                                        "unknown",
 5552                                        "unknown",
 5553                                        self.code_type_map["String"],
 5554                                    )
 5555                                )
 5556
 5557                    # For all fields in database
 5558                    annotation_fields_all = False
 5559                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 5560                        annotation_fields_all = True
 5561                        annotation_fields = {
 5562                            key: key for key in parquet_hdr_vcf_header_infos
 5563                        }
 5564
 5565                        log.debug(
 5566                            "Annotation database header - All annotations added: "
 5567                            + str(annotation_fields)
 5568                        )
 5569
 5570                    # Init
 5571
 5572                    # List of annotation fields to use
 5573                    sql_query_annotation_update_info_sets = []
 5574
 5575                    # List of annotation to agregate
 5576                    sql_query_annotation_to_agregate = []
 5577
 5578                    # Number of fields
 5579                    nb_annotation_field = 0
 5580
 5581                    # Annotation fields processed
 5582                    annotation_fields_processed = []
 5583
 5584                    # Columns mapping
 5585                    map_columns = database.map_columns(
 5586                        columns=annotation_fields, prefixes=["INFO/"]
 5587                    )
 5588
 5589                    # Query dict for fields to remove (update option)
 5590                    query_dict_remove = {}
 5591
 5592                    # Fetch Anotation fields
 5593                    for annotation_field in annotation_fields:
 5594
 5595                        # annotation_field_column
 5596                        annotation_field_column = map_columns.get(
 5597                            annotation_field, "INFO"
 5598                        )
 5599
 5600                        # field new name, if parametered
 5601                        annotation_fields_new_name = annotation_fields.get(
 5602                            annotation_field, annotation_field
 5603                        )
 5604                        if not annotation_fields_new_name:
 5605                            annotation_fields_new_name = annotation_field
 5606
 5607                        # To annotate
 5608                        # force_update_annotation = True
 5609                        # force_append_annotation = True
 5610                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 5611                        if annotation_field in parquet_hdr_vcf_header_infos and (
 5612                            force_update_annotation
 5613                            or force_append_annotation
 5614                            or (
 5615                                annotation_fields_new_name
 5616                                not in self.get_header().infos
 5617                            )
 5618                        ):
 5619
 5620                            # Add field to annotation to process list
 5621                            annotation_fields_processed.append(
 5622                                annotation_fields_new_name
 5623                            )
 5624
 5625                            # explode infos for the field
 5626                            annotation_fields_new_name_info_msg = ""
 5627                            if (
 5628                                force_update_annotation
 5629                                and annotation_fields_new_name
 5630                                in self.get_header().infos
 5631                            ):
 5632                                # Remove field from INFO
 5633                                query = f"""
 5634                                    UPDATE {table_variants} as table_variants
 5635                                    SET INFO = REGEXP_REPLACE(
 5636                                                concat(table_variants.INFO,''),
 5637                                                ';*{annotation_fields_new_name}=[^;]*',
 5638                                                ''
 5639                                                )
 5640                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 5641                                """
 5642                                annotation_fields_new_name_info_msg = " [update]"
 5643                                query_dict_remove[
 5644                                    f"remove 'INFO/{annotation_fields_new_name}'"
 5645                                ] = query
 5646
 5647                            # Sep between fields in INFO
 5648                            nb_annotation_field += 1
 5649                            if nb_annotation_field > 1:
 5650                                annotation_field_sep = ";"
 5651                            else:
 5652                                annotation_field_sep = ""
 5653
 5654                            log.info(
 5655                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 5656                            )
 5657
 5658                            # Add INFO field to header
 5659                            parquet_hdr_vcf_header_infos_number = (
 5660                                parquet_hdr_vcf_header_infos[annotation_field].num
 5661                                or "."
 5662                            )
 5663                            parquet_hdr_vcf_header_infos_type = (
 5664                                parquet_hdr_vcf_header_infos[annotation_field].type
 5665                                or "String"
 5666                            )
 5667                            parquet_hdr_vcf_header_infos_description = (
 5668                                parquet_hdr_vcf_header_infos[annotation_field].desc
 5669                                or f"{annotation_field} description"
 5670                            )
 5671                            parquet_hdr_vcf_header_infos_source = (
 5672                                parquet_hdr_vcf_header_infos[annotation_field].source
 5673                                or "unknown"
 5674                            )
 5675                            parquet_hdr_vcf_header_infos_version = (
 5676                                parquet_hdr_vcf_header_infos[annotation_field].version
 5677                                or "unknown"
 5678                            )
 5679
 5680                            vcf_reader.infos[annotation_fields_new_name] = (
 5681                                vcf.parser._Info(
 5682                                    annotation_fields_new_name,
 5683                                    parquet_hdr_vcf_header_infos_number,
 5684                                    parquet_hdr_vcf_header_infos_type,
 5685                                    parquet_hdr_vcf_header_infos_description,
 5686                                    parquet_hdr_vcf_header_infos_source,
 5687                                    parquet_hdr_vcf_header_infos_version,
 5688                                    self.code_type_map[
 5689                                        parquet_hdr_vcf_header_infos_type
 5690                                    ],
 5691                                )
 5692                            )
 5693
 5694                            # Append
 5695                            if force_append_annotation:
 5696                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 5697                            else:
 5698                                query_case_when_append = ""
 5699
 5700                            # Annotation/Update query fields
 5701                            # Found in INFO column
 5702                            if (
 5703                                annotation_field_column == "INFO"
 5704                                and "INFO" in parquet_hdr_vcf_header_columns
 5705                            ):
 5706                                sql_query_annotation_update_info_sets.append(
 5707                                    f"""
 5708                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 5709                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 5710                                        ELSE ''
 5711                                    END
 5712                                """
 5713                                )
 5714                            # Found in a specific column
 5715                            else:
 5716                                sql_query_annotation_update_info_sets.append(
 5717                                    f"""
 5718                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
 5719                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
 5720                                        ELSE ''
 5721                                    END
 5722                                """
 5723                                )
 5724                                sql_query_annotation_to_agregate.append(
 5725                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 5726                                )
 5727
 5728                        # Not to annotate
 5729                        else:
 5730
 5731                            if force_update_annotation:
 5732                                annotation_message = "forced"
 5733                            else:
 5734                                annotation_message = "skipped"
 5735
 5736                            if annotation_field not in parquet_hdr_vcf_header_infos:
 5737                                log.warning(
 5738                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 5739                                )
 5740                            if annotation_fields_new_name in self.get_header().infos:
 5741                                log.warning(
 5742                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 5743                                )
 5744
 5745                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 5746                    # allow_annotation_full_info = True
 5747                    allow_annotation_full_info = not force_append_annotation
 5748
 5749                    if parquet_type in ["regions"]:
 5750                        allow_annotation_full_info = False
 5751
 5752                    if (
 5753                        allow_annotation_full_info
 5754                        and nb_annotation_field == len(annotation_fields)
 5755                        and annotation_fields_all
 5756                        and (
 5757                            "INFO" in parquet_hdr_vcf_header_columns
 5758                            and "INFO" in database.get_extra_columns()
 5759                        )
 5760                    ):
 5761                        log.debug("Column INFO annotation enabled")
 5762                        sql_query_annotation_update_info_sets = []
 5763                        sql_query_annotation_update_info_sets.append(
 5764                            f" table_parquet.INFO "
 5765                        )
 5766
 5767                    if sql_query_annotation_update_info_sets:
 5768
 5769                        # Annotate
 5770                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 5771
 5772                        # Join query annotation update info sets for SQL
 5773                        sql_query_annotation_update_info_sets_sql = ",".join(
 5774                            sql_query_annotation_update_info_sets
 5775                        )
 5776
 5777                        # Check chromosomes list (and variants infos)
 5778                        sql_query_chromosomes = f"""
 5779                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 5780                            FROM {table_variants} as table_variants
 5781                            GROUP BY table_variants."#CHROM"
 5782                            ORDER BY table_variants."#CHROM"
 5783                            """
 5784                        sql_query_chromosomes_df = self.conn.execute(
 5785                            sql_query_chromosomes
 5786                        ).df()
 5787                        sql_query_chromosomes_dict = {
 5788                            entry["CHROM"]: {
 5789                                "count": entry["count_variants"],
 5790                                "min": entry["min_variants"],
 5791                                "max": entry["max_variants"],
 5792                            }
 5793                            for index, entry in sql_query_chromosomes_df.iterrows()
 5794                        }
 5795
 5796                        # Init
 5797                        nb_of_query = 0
 5798                        nb_of_variant_annotated = 0
 5799                        query_dict = query_dict_remove
 5800
 5801                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 5802                        for chrom in sql_query_chromosomes_dict:
 5803
 5804                            # Number of variant by chromosome
 5805                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 5806                                chrom, {}
 5807                            ).get("count", 0)
 5808
 5809                            log.debug(
 5810                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 5811                            )
 5812
 5813                            # Annotation with regions database
 5814                            if parquet_type in ["regions"]:
 5815                                sql_query_annotation_from_clause = f"""
 5816                                    FROM (
 5817                                        SELECT 
 5818                                            '{chrom}' AS \"#CHROM\",
 5819                                            table_variants_from.\"POS\" AS \"POS\",
 5820                                            {",".join(sql_query_annotation_to_agregate)}
 5821                                        FROM {table_variants} as table_variants_from
 5822                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 5823                                            table_parquet_from."#CHROM" = '{chrom}'
 5824                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 5825                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
 5826                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 5827                                                )
 5828                                        )
 5829                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 5830                                        GROUP BY table_variants_from.\"POS\"
 5831                                        )
 5832                                        as table_parquet
 5833                                """
 5834
 5835                                sql_query_annotation_where_clause = """
 5836                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 5837                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5838                                """
 5839
 5840                            # Annotation with variants database
 5841                            else:
 5842                                sql_query_annotation_from_clause = f"""
 5843                                    FROM {parquet_file_link} as table_parquet
 5844                                """
 5845                                sql_query_annotation_where_clause = f"""
 5846                                    table_variants."#CHROM" = '{chrom}'
 5847                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 5848                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5849                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5850                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5851                                """
 5852
 5853                            # Create update query
 5854                            sql_query_annotation_chrom_interval_pos = f"""
 5855                                UPDATE {table_variants} as table_variants
 5856                                    SET INFO = 
 5857                                        concat(
 5858                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5859                                                THEN table_variants.INFO
 5860                                                ELSE ''
 5861                                            END
 5862                                            ,
 5863                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5864                                                        AND (
 5865                                                        concat({sql_query_annotation_update_info_sets_sql})
 5866                                                        )
 5867                                                        NOT IN ('','.') 
 5868                                                    THEN ';'
 5869                                                    ELSE ''
 5870                                            END
 5871                                            ,
 5872                                            {sql_query_annotation_update_info_sets_sql}
 5873                                            )
 5874                                    {sql_query_annotation_from_clause}
 5875                                    WHERE {sql_query_annotation_where_clause}
 5876                                    ;
 5877                                """
 5878
 5879                            # Add update query to dict
 5880                            query_dict[
 5881                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 5882                            ] = sql_query_annotation_chrom_interval_pos
 5883
 5884                        nb_of_query = len(query_dict)
 5885                        num_query = 0
 5886
 5887                        # SET max_expression_depth TO x
 5888                        self.conn.execute("SET max_expression_depth TO 10000")
 5889
 5890                        for query_name in query_dict:
 5891                            query = query_dict[query_name]
 5892                            num_query += 1
 5893                            log.info(
 5894                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 5895                            )
 5896                            result = self.conn.execute(query)
 5897                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 5898                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 5899                            log.info(
 5900                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 5901                            )
 5902
 5903                        log.info(
 5904                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 5905                        )
 5906
 5907                    else:
 5908
 5909                        log.info(
 5910                            f"Annotation '{annotation_name}' - No Annotations available"
 5911                        )
 5912
 5913                    log.debug("Final header: " + str(vcf_reader.infos))
 5914
 5915        # Remove added columns
 5916        for added_column in added_columns:
 5917            self.drop_column(column=added_column)
 5918
 5919    def annotation_splice(self, threads: int = None) -> None:
 5920        """
 5921        This function annotate with snpEff
 5922
 5923        :param threads: The number of threads to use
 5924        :return: the value of the variable "return_value".
 5925        """
 5926
 5927        # DEBUG
 5928        log.debug("Start annotation with splice tools")
 5929
 5930        # Threads
 5931        if not threads:
 5932            threads = self.get_threads()
 5933        log.debug("Threads: " + str(threads))
 5934
 5935        # DEBUG
 5936        delete_tmp = True
 5937        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5938            delete_tmp = False
 5939            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5940
 5941        # Config
 5942        config = self.get_config()
 5943        log.debug("Config: " + str(config))
 5944        splice_config = config.get("tools", {}).get("splice", {})
 5945        if not splice_config:
 5946            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 5947        if not splice_config:
 5948            msg_err = "No Splice tool config"
 5949            log.error(msg_err)
 5950            raise ValueError(msg_err)
 5951        log.debug(f"splice_config={splice_config}")
 5952
 5953        # Config - Folders - Databases
 5954        databases_folders = (
 5955            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 5956        )
 5957        log.debug("Databases annotations: " + str(databases_folders))
 5958
 5959        # Splice docker image
 5960        splice_docker_image = splice_config.get("docker").get("image")
 5961
 5962        # Pull splice image if it's not already there
 5963        if not check_docker_image_exists(splice_docker_image):
 5964            log.warning(
 5965                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 5966            )
 5967            try:
 5968                command(f"docker pull {splice_config.get('docker').get('image')}")
 5969            except subprocess.CalledProcessError:
 5970                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 5971                log.error(msg_err)
 5972                raise ValueError(msg_err)
 5973                return None
 5974
 5975        # Config - splice databases
 5976        splice_databases = (
 5977            config.get("folders", {})
 5978            .get("databases", {})
 5979            .get("splice", DEFAULT_SPLICE_FOLDER)
 5980        )
 5981        splice_databases = full_path(splice_databases)
 5982
 5983        # Param
 5984        param = self.get_param()
 5985        log.debug("Param: " + str(param))
 5986
 5987        # Param
 5988        options = param.get("annotation", {}).get("splice", {})
 5989        log.debug("Options: " + str(options))
 5990
 5991        # Data
 5992        table_variants = self.get_table_variants()
 5993
 5994        # Check if not empty
 5995        log.debug("Check if not empty")
 5996        sql_query_chromosomes = (
 5997            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5998        )
 5999        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6000            log.info("VCF empty")
 6001            return None
 6002
 6003        # Export in VCF
 6004        log.debug("Create initial file to annotate")
 6005
 6006        # Create output folder
 6007        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6008        if not os.path.exists(output_folder):
 6009            Path(output_folder).mkdir(parents=True, exist_ok=True)
 6010
 6011        # Create tmp VCF file
 6012        tmp_vcf = NamedTemporaryFile(
 6013            prefix=self.get_prefix(),
 6014            dir=output_folder,
 6015            suffix=".vcf",
 6016            delete=False,
 6017        )
 6018        tmp_vcf_name = tmp_vcf.name
 6019
 6020        # VCF header
 6021        header = self.get_header()
 6022
 6023        # Existing annotations
 6024        for vcf_annotation in self.get_header().infos:
 6025
 6026            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6027            log.debug(
 6028                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6029            )
 6030
 6031        # Memory limit
 6032        if config.get("memory", None):
 6033            memory_limit = config.get("memory", "8G").upper()
 6034            # upper()
 6035        else:
 6036            memory_limit = "8G"
 6037        log.debug(f"memory_limit: {memory_limit}")
 6038
 6039        # Check number of variants to annotate
 6040        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6041        where_clause_regex_spip = r"SPiP_\w+"
 6042        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6043        df_list_of_variants_to_annotate = self.get_query_to_df(
 6044            query=f""" SELECT * FROM variants {where_clause} """
 6045        )
 6046        if len(df_list_of_variants_to_annotate) == 0:
 6047            log.warning(
 6048                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6049            )
 6050            return None
 6051        else:
 6052            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6053
 6054        # Export VCF file
 6055        self.export_variant_vcf(
 6056            vcf_file=tmp_vcf_name,
 6057            remove_info=True,
 6058            add_samples=True,
 6059            index=False,
 6060            where_clause=where_clause,
 6061        )
 6062
 6063        # Create docker container and launch splice analysis
 6064        if splice_config:
 6065
 6066            # Splice mount folders
 6067            mount_folders = splice_config.get("mount", {})
 6068
 6069            # Genome mount
 6070            mount_folders[
 6071                config.get("folders", {})
 6072                .get("databases", {})
 6073                .get("genomes", DEFAULT_GENOME_FOLDER)
 6074            ] = "ro"
 6075
 6076            # SpliceAI mount
 6077            mount_folders[
 6078                config.get("folders", {})
 6079                .get("databases", {})
 6080                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
 6081            ] = "ro"
 6082
 6083            # Genome mount
 6084            mount_folders[
 6085                config.get("folders", {})
 6086                .get("databases", {})
 6087                .get("spip", DEFAULT_SPIP_FOLDER)
 6088            ] = "ro"
 6089
 6090            # Mount folders
 6091            mount = []
 6092
 6093            # Config mount
 6094            mount = [
 6095                f"-v {full_path(path)}:{full_path(path)}:{mode}"
 6096                for path, mode in mount_folders.items()
 6097            ]
 6098
 6099            if any(value for value in splice_config.values() if value is None):
 6100                log.warning("At least one splice config parameter is empty")
 6101                return None
 6102
 6103            # Params in splice nf
 6104            def check_values(dico: dict):
 6105                """
 6106                Ensure parameters for NF splice pipeline
 6107                """
 6108                for key, val in dico.items():
 6109                    if key == "genome":
 6110                        if any(
 6111                            assemb in options.get("genome", {})
 6112                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6113                        ):
 6114                            yield f"--{key} hg19"
 6115                        elif any(
 6116                            assemb in options.get("genome", {})
 6117                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6118                        ):
 6119                            yield f"--{key} hg38"
 6120                    elif (
 6121                        (isinstance(val, str) and val)
 6122                        or isinstance(val, int)
 6123                        or isinstance(val, bool)
 6124                    ):
 6125                        yield f"--{key} {val}"
 6126
 6127            # Genome
 6128            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6129            options["genome"] = genome
 6130
 6131            # NF params
 6132            nf_params = []
 6133
 6134            # Add options
 6135            if options:
 6136                nf_params = list(check_values(options))
 6137                log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6138            else:
 6139                log.debug("No NF params provided")
 6140
 6141            # Add threads
 6142            if "threads" not in options.keys():
 6143                nf_params.append(f"--threads {threads}")
 6144
 6145            # Genome path
 6146            genome_path = find_genome(
 6147                config.get("folders", {})
 6148                .get("databases", {})
 6149                .get("genomes", DEFAULT_GENOME_FOLDER),
 6150                file=f"{genome}.fa",
 6151            )
 6152            # Add genome path
 6153            if not genome_path:
 6154                raise ValueError(
 6155                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6156                )
 6157            else:
 6158                log.debug(f"Genome: {genome_path}")
 6159                nf_params.append(f"--genome_path {genome_path}")
 6160
 6161            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6162                """
 6163                Setting up updated databases for SPiP and SpliceAI
 6164                """
 6165
 6166                try:
 6167
 6168                    # SpliceAI assembly transcriptome
 6169                    spliceai_assembly = os.path.join(
 6170                        config.get("folders", {})
 6171                        .get("databases", {})
 6172                        .get("spliceai", {}),
 6173                        options.get("genome"),
 6174                        "transcriptome",
 6175                    )
 6176                    spip_assembly = options.get("genome")
 6177
 6178                    spip = find(
 6179                        f"transcriptome_{spip_assembly}.RData",
 6180                        config.get("folders", {}).get("databases", {}).get("spip", {}),
 6181                    )
 6182                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6183                    log.debug(f"SPiP annotations: {spip}")
 6184                    log.debug(f"SpliceAI annotations: {spliceai}")
 6185                    if spip and spliceai:
 6186                        return [
 6187                            f"--spip_transcriptome {spip}",
 6188                            f"--spliceai_annotations {spliceai}",
 6189                        ]
 6190                    else:
 6191                        # TODO crash and go on with basic annotations ?
 6192                        # raise ValueError(
 6193                        #     "Can't find splice databases in configuration EXIT"
 6194                        # )
 6195                        log.warning(
 6196                            "Can't find splice databases in configuration, use annotations file from image"
 6197                        )
 6198                except TypeError:
 6199                    log.warning(
 6200                        "Can't find splice databases in configuration, use annotations file from image"
 6201                    )
 6202                    return []
 6203
 6204            # Add options, check if transcriptome option have already beend provided
 6205            if (
 6206                "spip_transcriptome" not in nf_params
 6207                and "spliceai_transcriptome" not in nf_params
 6208            ):
 6209                splice_reference = splice_annotations(options, config)
 6210                if splice_reference:
 6211                    nf_params.extend(splice_reference)
 6212
 6213            nf_params.append(f"--output_folder {output_folder}")
 6214
 6215            random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6216            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6217            log.debug(cmd)
 6218
 6219            splice_config["docker"]["command"] = cmd
 6220
 6221            docker_cmd = get_bin_command(
 6222                tool="splice",
 6223                bin_type="docker",
 6224                config=config,
 6225                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6226                add_options=f"--name {random_uuid} {' '.join(mount)}",
 6227            )
 6228
 6229            # Docker debug
 6230            # if splice_config.get("rm_container"):
 6231            #     rm_container = "--rm"
 6232            # else:
 6233            #     rm_container = ""
 6234            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6235
 6236            log.debug(docker_cmd)
 6237            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6238            log.debug(res.stdout)
 6239            if res.stderr:
 6240                log.error(res.stderr)
 6241            res.check_returncode()
 6242        else:
 6243            log.warning(f"Splice tool configuration not found: {config}")
 6244
 6245        # Update variants
 6246        log.info("Annotation - Updating...")
 6247        # Test find output vcf
 6248        log.debug(
 6249            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6250        )
 6251        output_vcf = []
 6252        # Wrong folder to look in
 6253        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6254            if (
 6255                files
 6256                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6257            ):
 6258                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6259        # log.debug(os.listdir(options.get("output_folder")))
 6260        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6261        if not output_vcf:
 6262            log.debug(
 6263                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6264            )
 6265        else:
 6266            # Get new header from annotated vcf
 6267            log.debug(f"Initial header: {len(header.infos)} fields")
 6268            # Create new header with splice infos
 6269            new_vcf = Variants(input=output_vcf[0])
 6270            new_vcf_header = new_vcf.get_header().infos
 6271            for keys, infos in new_vcf_header.items():
 6272                if keys not in header.infos.keys():
 6273                    header.infos[keys] = infos
 6274            log.debug(f"New header: {len(header.infos)} fields")
 6275            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6276            self.update_from_vcf(output_vcf[0])
 6277
 6278        # Remove folder
 6279        remove_if_exists(output_folder)
 6280
 6281    ###
 6282    # Prioritization
 6283    ###
 6284
 6285    def get_config_default(self, name: str) -> dict:
 6286        """
 6287        The function `get_config_default` returns a dictionary containing default configurations for
 6288        various calculations and prioritizations.
 6289
 6290        :param name: The `get_config_default` function returns a dictionary containing default
 6291        configurations for different calculations and prioritizations. The `name` parameter is used to
 6292        specify which specific configuration to retrieve from the dictionary
 6293        :type name: str
 6294        :return: The function `get_config_default` returns a dictionary containing default configuration
 6295        settings for different calculations and prioritizations. The specific configuration settings are
 6296        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6297        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6298        returned. If there is no match, an empty dictionary is returned.
 6299        """
 6300
 6301        config_default = {
 6302            "calculations": {
 6303                "variant_chr_pos_alt_ref": {
 6304                    "type": "sql",
 6305                    "name": "variant_chr_pos_alt_ref",
 6306                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6307                    "available": False,
 6308                    "output_column_name": "variant_chr_pos_alt_ref",
 6309                    "output_column_type": "String",
 6310                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6311                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6312                    "operation_info": True,
 6313                },
 6314                "VARTYPE": {
 6315                    "type": "sql",
 6316                    "name": "VARTYPE",
 6317                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6318                    "available": True,
 6319                    "output_column_name": "VARTYPE",
 6320                    "output_column_type": "String",
 6321                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6322                    "operation_query": """
 6323                            CASE
 6324                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6325                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6326                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6327                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6328                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6329                                ELSE 'UNDEFINED'
 6330                            END
 6331                            """,
 6332                    "info_fields": ["SVTYPE"],
 6333                    "operation_info": True,
 6334                },
 6335                "snpeff_hgvs": {
 6336                    "type": "python",
 6337                    "name": "snpeff_hgvs",
 6338                    "description": "HGVS nomenclatures from snpEff annotation",
 6339                    "available": True,
 6340                    "function_name": "calculation_extract_snpeff_hgvs",
 6341                    "function_params": ["snpeff_hgvs", "ANN"],
 6342                },
 6343                "snpeff_ann_explode": {
 6344                    "type": "python",
 6345                    "name": "snpeff_ann_explode",
 6346                    "description": "Explode snpEff annotations with uniquify values",
 6347                    "available": True,
 6348                    "function_name": "calculation_snpeff_ann_explode",
 6349                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6350                },
 6351                "snpeff_ann_explode_uniquify": {
 6352                    "type": "python",
 6353                    "name": "snpeff_ann_explode_uniquify",
 6354                    "description": "Explode snpEff annotations",
 6355                    "available": True,
 6356                    "function_name": "calculation_snpeff_ann_explode",
 6357                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6358                },
 6359                "snpeff_ann_explode_json": {
 6360                    "type": "python",
 6361                    "name": "snpeff_ann_explode_json",
 6362                    "description": "Explode snpEff annotations in JSON format",
 6363                    "available": True,
 6364                    "function_name": "calculation_snpeff_ann_explode",
 6365                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6366                },
 6367                "NOMEN": {
 6368                    "type": "python",
 6369                    "name": "NOMEN",
 6370                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
 6371                    "available": True,
 6372                    "function_name": "calculation_extract_nomen",
 6373                    "function_params": [],
 6374                },
 6375                "FINDBYPIPELINE": {
 6376                    "type": "python",
 6377                    "name": "FINDBYPIPELINE",
 6378                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6379                    "available": True,
 6380                    "function_name": "calculation_find_by_pipeline",
 6381                    "function_params": ["findbypipeline"],
 6382                },
 6383                "FINDBYSAMPLE": {
 6384                    "type": "python",
 6385                    "name": "FINDBYSAMPLE",
 6386                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6387                    "available": True,
 6388                    "function_name": "calculation_find_by_pipeline",
 6389                    "function_params": ["findbysample"],
 6390                },
 6391                "GENOTYPECONCORDANCE": {
 6392                    "type": "python",
 6393                    "name": "GENOTYPECONCORDANCE",
 6394                    "description": "Concordance of genotype for multi caller VCF",
 6395                    "available": True,
 6396                    "function_name": "calculation_genotype_concordance",
 6397                    "function_params": [],
 6398                },
 6399                "BARCODE": {
 6400                    "type": "python",
 6401                    "name": "BARCODE",
 6402                    "description": "BARCODE as VaRank tool",
 6403                    "available": True,
 6404                    "function_name": "calculation_barcode",
 6405                    "function_params": [],
 6406                },
 6407                "BARCODEFAMILY": {
 6408                    "type": "python",
 6409                    "name": "BARCODEFAMILY",
 6410                    "description": "BARCODEFAMILY as VaRank tool",
 6411                    "available": True,
 6412                    "function_name": "calculation_barcode_family",
 6413                    "function_params": ["BCF"],
 6414                },
 6415                "TRIO": {
 6416                    "type": "python",
 6417                    "name": "TRIO",
 6418                    "description": "Inheritance for a trio family",
 6419                    "available": True,
 6420                    "function_name": "calculation_trio",
 6421                    "function_params": [],
 6422                },
 6423                "VAF": {
 6424                    "type": "python",
 6425                    "name": "VAF",
 6426                    "description": "Variant Allele Frequency (VAF) harmonization",
 6427                    "available": True,
 6428                    "function_name": "calculation_vaf_normalization",
 6429                    "function_params": [],
 6430                },
 6431                "VAF_stats": {
 6432                    "type": "python",
 6433                    "name": "VAF_stats",
 6434                    "description": "Variant Allele Frequency (VAF) statistics",
 6435                    "available": True,
 6436                    "function_name": "calculation_genotype_stats",
 6437                    "function_params": ["VAF"],
 6438                },
 6439                "DP_stats": {
 6440                    "type": "python",
 6441                    "name": "DP_stats",
 6442                    "description": "Depth (DP) statistics",
 6443                    "available": True,
 6444                    "function_name": "calculation_genotype_stats",
 6445                    "function_params": ["DP"],
 6446                },
 6447                "variant_id": {
 6448                    "type": "python",
 6449                    "name": "variant_id",
 6450                    "description": "Variant ID generated from variant position and type",
 6451                    "available": True,
 6452                    "function_name": "calculation_variant_id",
 6453                    "function_params": [],
 6454                },
 6455                "transcripts_json": {
 6456                    "type": "python",
 6457                    "name": "transcripts_json",
 6458                    "description": "Add transcripts info in JSON format (field 'transcripts_json')",
 6459                    "available": True,
 6460                    "function_name": "calculation_transcripts_json",
 6461                    "function_params": ["transcripts_json"],
 6462                },
 6463                "transcripts_prioritization": {
 6464                    "type": "python",
 6465                    "name": "transcripts_prioritization",
 6466                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6467                    "available": True,
 6468                    "function_name": "calculation_transcripts_prioritization",
 6469                    "function_params": [],
 6470                },
 6471            },
 6472            "prioritizations": {
 6473                "default": {
 6474                    "filter": [
 6475                        {
 6476                            "type": "notequals",
 6477                            "value": "!PASS|\\.",
 6478                            "score": 0,
 6479                            "flag": "FILTERED",
 6480                            "comment": ["Bad variant quality"],
 6481                        },
 6482                        {
 6483                            "type": "equals",
 6484                            "value": "REJECT",
 6485                            "score": -20,
 6486                            "flag": "PASS",
 6487                            "comment": ["Bad variant quality"],
 6488                        },
 6489                    ],
 6490                    "DP": [
 6491                        {
 6492                            "type": "gte",
 6493                            "value": "50",
 6494                            "score": 5,
 6495                            "flag": "PASS",
 6496                            "comment": ["DP higher than 50"],
 6497                        }
 6498                    ],
 6499                    "ANN": [
 6500                        {
 6501                            "type": "contains",
 6502                            "value": "HIGH",
 6503                            "score": 5,
 6504                            "flag": "PASS",
 6505                            "comment": [
 6506                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6507                            ],
 6508                        },
 6509                        {
 6510                            "type": "contains",
 6511                            "value": "MODERATE",
 6512                            "score": 3,
 6513                            "flag": "PASS",
 6514                            "comment": [
 6515                                "A non-disruptive variant that might change protein effectiveness"
 6516                            ],
 6517                        },
 6518                        {
 6519                            "type": "contains",
 6520                            "value": "LOW",
 6521                            "score": 0,
 6522                            "flag": "FILTERED",
 6523                            "comment": [
 6524                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 6525                            ],
 6526                        },
 6527                        {
 6528                            "type": "contains",
 6529                            "value": "MODIFIER",
 6530                            "score": 0,
 6531                            "flag": "FILTERED",
 6532                            "comment": [
 6533                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 6534                            ],
 6535                        },
 6536                    ],
 6537                }
 6538            },
 6539        }
 6540
 6541        return config_default.get(name, None)
 6542
 6543    def get_config_json(
 6544        self, name: str, config_dict: dict = {}, config_file: str = None
 6545    ) -> dict:
 6546        """
 6547        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 6548        default values, a dictionary, and a file.
 6549
 6550        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 6551        the name of the configuration. It is used to identify and retrieve the configuration settings
 6552        for a specific component or module
 6553        :type name: str
 6554        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 6555        dictionary that allows you to provide additional configuration settings or overrides. When you
 6556        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 6557        the key is the configuration setting you want to override or
 6558        :type config_dict: dict
 6559        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 6560        specify the path to a configuration file that contains additional settings. If provided, the
 6561        function will read the contents of this file and update the configuration dictionary with the
 6562        values found in the file, overriding any existing values with the
 6563        :type config_file: str
 6564        :return: The function `get_config_json` returns a dictionary containing the configuration
 6565        settings.
 6566        """
 6567
 6568        # Create with default prioritizations
 6569        config_default = self.get_config_default(name=name)
 6570        configuration = config_default
 6571        # log.debug(f"configuration={configuration}")
 6572
 6573        # Replace prioritizations from dict
 6574        for config in config_dict:
 6575            configuration[config] = config_dict[config]
 6576
 6577        # Replace prioritizations from file
 6578        config_file = full_path(config_file)
 6579        if config_file:
 6580            if os.path.exists(config_file):
 6581                with open(config_file) as config_file_content:
 6582                    config_file_dict = json.load(config_file_content)
 6583                for config in config_file_dict:
 6584                    configuration[config] = config_file_dict[config]
 6585            else:
 6586                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 6587                log.error(msg_error)
 6588                raise ValueError(msg_error)
 6589
 6590        return configuration
 6591
 6592    def prioritization(
 6593        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 6594    ) -> bool:
 6595        """
 6596        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 6597        prioritizes variants based on configured profiles and criteria.
 6598
 6599        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 6600        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 6601        a table name is provided, the method will prioritize the variants in that specific table
 6602        :type table: str
 6603        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 6604        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 6605        provided, the code will use a default prefix value of "PZ"
 6606        :type pz_prefix: str
 6607        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 6608        additional parameters specific to the prioritization process. These parameters can include
 6609        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 6610        configurations needed for the prioritization of variants in a V
 6611        :type pz_param: dict
 6612        :return: A boolean value (True) is being returned from the `prioritization` function.
 6613        """
 6614
 6615        # Config
 6616        config = self.get_config()
 6617
 6618        # Param
 6619        param = self.get_param()
 6620
 6621        # Prioritization param
 6622        if pz_param is not None:
 6623            prioritization_param = pz_param
 6624        else:
 6625            prioritization_param = param.get("prioritization", {})
 6626
 6627        # Configuration profiles
 6628        prioritization_config_file = prioritization_param.get(
 6629            "prioritization_config", None
 6630        )
 6631        prioritization_config_file = full_path(prioritization_config_file)
 6632        prioritizations_config = self.get_config_json(
 6633            name="prioritizations", config_file=prioritization_config_file
 6634        )
 6635
 6636        # Prioritization prefix
 6637        pz_prefix_default = "PZ"
 6638        if pz_prefix is None:
 6639            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 6640
 6641        # Prioritization options
 6642        profiles = prioritization_param.get("profiles", [])
 6643        if isinstance(profiles, str):
 6644            profiles = profiles.split(",")
 6645        pzfields = prioritization_param.get(
 6646            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 6647        )
 6648        if isinstance(pzfields, str):
 6649            pzfields = pzfields.split(",")
 6650        default_profile = prioritization_param.get("default_profile", None)
 6651        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 6652        prioritization_score_mode = prioritization_param.get(
 6653            "prioritization_score_mode", "HOWARD"
 6654        )
 6655
 6656        # Quick Prioritizations
 6657        prioritizations = param.get("prioritizations", None)
 6658        if prioritizations:
 6659            log.info("Quick Prioritization:")
 6660            for profile in prioritizations.split(","):
 6661                if profile not in profiles:
 6662                    profiles.append(profile)
 6663                    log.info(f"   {profile}")
 6664
 6665        # If profile "ALL" provided, all profiles in the config profiles
 6666        if "ALL" in profiles:
 6667            profiles = list(prioritizations_config.keys())
 6668
 6669        for profile in profiles:
 6670            if prioritizations_config.get(profile, None):
 6671                log.debug(f"Profile '{profile}' configured")
 6672            else:
 6673                msg_error = f"Profile '{profile}' NOT configured"
 6674                log.error(msg_error)
 6675                raise ValueError(msg_error)
 6676
 6677        if profiles:
 6678            log.info(f"Prioritization... ")
 6679        else:
 6680            log.debug(f"No profile defined")
 6681            return False
 6682
 6683        if not default_profile and len(profiles):
 6684            default_profile = profiles[0]
 6685
 6686        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 6687        log.debug("Profiles to check: " + str(list(profiles)))
 6688
 6689        # Variables
 6690        if table is not None:
 6691            table_variants = table
 6692        else:
 6693            table_variants = self.get_table_variants(clause="update")
 6694        log.debug(f"Table to prioritize: {table_variants}")
 6695
 6696        # Added columns
 6697        added_columns = []
 6698
 6699        # Create list of PZfields
 6700        # List of PZFields
 6701        list_of_pzfields_original = pzfields + [
 6702            pzfield + pzfields_sep + profile
 6703            for pzfield in pzfields
 6704            for profile in profiles
 6705        ]
 6706        list_of_pzfields = []
 6707        log.debug(f"{list_of_pzfields_original}")
 6708
 6709        # Remove existing PZfields to use if exists
 6710        for pzfield in list_of_pzfields_original:
 6711            if self.get_header().infos.get(pzfield, None) is None:
 6712                list_of_pzfields.append(pzfield)
 6713                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 6714            else:
 6715                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 6716
 6717        if list_of_pzfields:
 6718
 6719            # Explode Infos prefix
 6720            explode_infos_prefix = self.get_explode_infos_prefix()
 6721
 6722            # PZfields tags description
 6723            PZfields_INFOS = {
 6724                f"{pz_prefix}Tags": {
 6725                    "ID": f"{pz_prefix}Tags",
 6726                    "Number": ".",
 6727                    "Type": "String",
 6728                    "Description": "Variant tags based on annotation criteria",
 6729                },
 6730                f"{pz_prefix}Score": {
 6731                    "ID": f"{pz_prefix}Score",
 6732                    "Number": 1,
 6733                    "Type": "Integer",
 6734                    "Description": "Variant score based on annotation criteria",
 6735                },
 6736                f"{pz_prefix}Flag": {
 6737                    "ID": f"{pz_prefix}Flag",
 6738                    "Number": 1,
 6739                    "Type": "String",
 6740                    "Description": "Variant flag based on annotation criteria",
 6741                },
 6742                f"{pz_prefix}Comment": {
 6743                    "ID": f"{pz_prefix}Comment",
 6744                    "Number": ".",
 6745                    "Type": "String",
 6746                    "Description": "Variant comment based on annotation criteria",
 6747                },
 6748                f"{pz_prefix}Infos": {
 6749                    "ID": f"{pz_prefix}Infos",
 6750                    "Number": ".",
 6751                    "Type": "String",
 6752                    "Description": "Variant infos based on annotation criteria",
 6753                },
 6754            }
 6755
 6756            # Create INFO fields if not exist
 6757            for field in PZfields_INFOS:
 6758                field_ID = PZfields_INFOS[field]["ID"]
 6759                field_description = PZfields_INFOS[field]["Description"]
 6760                if field_ID not in self.get_header().infos and field_ID in pzfields:
 6761                    field_description = (
 6762                        PZfields_INFOS[field]["Description"]
 6763                        + f", profile {default_profile}"
 6764                    )
 6765                    self.get_header().infos[field_ID] = vcf.parser._Info(
 6766                        field_ID,
 6767                        PZfields_INFOS[field]["Number"],
 6768                        PZfields_INFOS[field]["Type"],
 6769                        field_description,
 6770                        "unknown",
 6771                        "unknown",
 6772                        code_type_map[PZfields_INFOS[field]["Type"]],
 6773                    )
 6774
 6775            # Create INFO fields if not exist for each profile
 6776            for profile in prioritizations_config:
 6777                if profile in profiles or profiles == []:
 6778                    for field in PZfields_INFOS:
 6779                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 6780                        field_description = (
 6781                            PZfields_INFOS[field]["Description"]
 6782                            + f", profile {profile}"
 6783                        )
 6784                        if (
 6785                            field_ID not in self.get_header().infos
 6786                            and field in pzfields
 6787                        ):
 6788                            self.get_header().infos[field_ID] = vcf.parser._Info(
 6789                                field_ID,
 6790                                PZfields_INFOS[field]["Number"],
 6791                                PZfields_INFOS[field]["Type"],
 6792                                field_description,
 6793                                "unknown",
 6794                                "unknown",
 6795                                code_type_map[PZfields_INFOS[field]["Type"]],
 6796                            )
 6797
 6798            # Header
 6799            for pzfield in list_of_pzfields:
 6800                if re.match(f"{pz_prefix}Score.*", pzfield):
 6801                    added_column = self.add_column(
 6802                        table_name=table_variants,
 6803                        column_name=pzfield,
 6804                        column_type="INTEGER",
 6805                        default_value="0",
 6806                    )
 6807                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 6808                    added_column = self.add_column(
 6809                        table_name=table_variants,
 6810                        column_name=pzfield,
 6811                        column_type="BOOLEAN",
 6812                        default_value="1",
 6813                    )
 6814                else:
 6815                    added_column = self.add_column(
 6816                        table_name=table_variants,
 6817                        column_name=pzfield,
 6818                        column_type="STRING",
 6819                        default_value="''",
 6820                    )
 6821                added_columns.append(added_column)
 6822
 6823            # Profiles
 6824            if profiles:
 6825
 6826                # foreach profile in configuration file
 6827                for profile in prioritizations_config:
 6828
 6829                    # If profile is asked in param, or ALL are asked (empty profile [])
 6830                    if profile in profiles or profiles == []:
 6831                        log.info(f"Profile '{profile}'")
 6832
 6833                        sql_set_info_option = ""
 6834
 6835                        sql_set_info = []
 6836
 6837                        # PZ fields set
 6838
 6839                        # PZScore
 6840                        if (
 6841                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 6842                            in list_of_pzfields
 6843                        ):
 6844                            sql_set_info.append(
 6845                                f"""
 6846                                    concat(
 6847                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 6848                                        {pz_prefix}Score{pzfields_sep}{profile}
 6849                                    ) 
 6850                                """
 6851                            )
 6852                            if (
 6853                                profile == default_profile
 6854                                and f"{pz_prefix}Score" in list_of_pzfields
 6855                            ):
 6856                                sql_set_info.append(
 6857                                    f"""
 6858                                        concat(
 6859                                            '{pz_prefix}Score=',
 6860                                            {pz_prefix}Score{pzfields_sep}{profile}
 6861                                        )
 6862                                    """
 6863                                )
 6864
 6865                        # PZFlag
 6866                        if (
 6867                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 6868                            in list_of_pzfields
 6869                        ):
 6870                            sql_set_info.append(
 6871                                f"""
 6872                                    concat(
 6873                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 6874                                        CASE 
 6875                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 6876                                            THEN 'PASS'
 6877                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 6878                                            THEN 'FILTERED'
 6879                                        END
 6880                                    ) 
 6881                                """
 6882                            )
 6883                            if (
 6884                                profile == default_profile
 6885                                and f"{pz_prefix}Flag" in list_of_pzfields
 6886                            ):
 6887                                sql_set_info.append(
 6888                                    f"""
 6889                                        concat(
 6890                                            '{pz_prefix}Flag=',
 6891                                            CASE 
 6892                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 6893                                                THEN 'PASS'
 6894                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 6895                                                THEN 'FILTERED'
 6896                                            END
 6897                                        )
 6898                                    """
 6899                                )
 6900
 6901                        # PZComment
 6902                        if (
 6903                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 6904                            in list_of_pzfields
 6905                        ):
 6906                            sql_set_info.append(
 6907                                f"""
 6908                                    CASE
 6909                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 6910                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 6911                                        ELSE ''
 6912                                    END
 6913                                """
 6914                            )
 6915                            if (
 6916                                profile == default_profile
 6917                                and f"{pz_prefix}Comment" in list_of_pzfields
 6918                            ):
 6919                                sql_set_info.append(
 6920                                    f"""
 6921                                        CASE
 6922                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 6923                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 6924                                            ELSE ''
 6925                                        END
 6926                                    """
 6927                                )
 6928
 6929                        # PZInfos
 6930                        if (
 6931                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 6932                            in list_of_pzfields
 6933                        ):
 6934                            sql_set_info.append(
 6935                                f"""
 6936                                    CASE
 6937                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 6938                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 6939                                        ELSE ''
 6940                                    END
 6941                                """
 6942                            )
 6943                            if (
 6944                                profile == default_profile
 6945                                and f"{pz_prefix}Infos" in list_of_pzfields
 6946                            ):
 6947                                sql_set_info.append(
 6948                                    f"""
 6949                                        CASE
 6950                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 6951                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 6952                                            ELSE ''
 6953                                        END
 6954                                    """
 6955                                )
 6956
 6957                        # Merge PZfields
 6958                        sql_set_info_option = ""
 6959                        sql_set_sep = ""
 6960                        for sql_set in sql_set_info:
 6961                            if sql_set_sep:
 6962                                sql_set_info_option += f"""
 6963                                    , concat('{sql_set_sep}', {sql_set})
 6964                                """
 6965                            else:
 6966                                sql_set_info_option += f"""
 6967                                    , {sql_set}
 6968                                """
 6969                            sql_set_sep = ";"
 6970
 6971                        sql_queries = []
 6972                        for annotation in prioritizations_config[profile]:
 6973
 6974                            # Explode specific annotation
 6975                            log.debug(f"Explode annotation '{annotation}'")
 6976                            added_columns += self.explode_infos(
 6977                                prefix=explode_infos_prefix,
 6978                                fields=[annotation],
 6979                                table=table_variants,
 6980                            )
 6981                            extra_infos = self.get_extra_infos(table=table_variants)
 6982
 6983                            # Check if annotation field is present
 6984                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
 6985                                log.debug(f"Annotation '{annotation}' not in data")
 6986                                continue
 6987                            else:
 6988                                log.debug(f"Annotation '{annotation}' in data")
 6989
 6990                            # For each criterions
 6991                            for criterion in prioritizations_config[profile][
 6992                                annotation
 6993                            ]:
 6994                                criterion_type = criterion["type"]
 6995                                criterion_value = criterion["value"]
 6996                                criterion_score = criterion.get("score", 0)
 6997                                criterion_flag = criterion.get("flag", "PASS")
 6998                                criterion_flag_bool = criterion_flag == "PASS"
 6999                                criterion_comment = (
 7000                                    ", ".join(criterion.get("comment", []))
 7001                                    .replace("'", "''")
 7002                                    .replace(";", ",")
 7003                                    .replace("\t", " ")
 7004                                )
 7005                                criterion_infos = (
 7006                                    str(criterion)
 7007                                    .replace("'", "''")
 7008                                    .replace(";", ",")
 7009                                    .replace("\t", " ")
 7010                                )
 7011
 7012                                sql_set = []
 7013                                sql_set_info = []
 7014
 7015                                # PZ fields set
 7016                                if (
 7017                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7018                                    in list_of_pzfields
 7019                                ):
 7020                                    if prioritization_score_mode == "HOWARD":
 7021                                        sql_set.append(
 7022                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7023                                        )
 7024                                    elif prioritization_score_mode == "VaRank":
 7025                                        sql_set.append(
 7026                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
 7027                                        )
 7028                                    else:
 7029                                        sql_set.append(
 7030                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7031                                        )
 7032                                if (
 7033                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7034                                    in list_of_pzfields
 7035                                ):
 7036                                    sql_set.append(
 7037                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7038                                    )
 7039                                if (
 7040                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7041                                    in list_of_pzfields
 7042                                ):
 7043                                    sql_set.append(
 7044                                        f"""
 7045                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7046                                                concat(
 7047                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7048                                                    CASE 
 7049                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7050                                                        THEN ', '
 7051                                                        ELSE ''
 7052                                                    END,
 7053                                                    '{criterion_comment}'
 7054                                                )
 7055                                        """
 7056                                    )
 7057                                if (
 7058                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7059                                    in list_of_pzfields
 7060                                ):
 7061                                    sql_set.append(
 7062                                        f"""
 7063                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7064                                                concat(
 7065                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7066                                                    '{criterion_infos}'
 7067                                                )
 7068                                        """
 7069                                    )
 7070                                sql_set_option = ",".join(sql_set)
 7071
 7072                                # Criterion and comparison
 7073                                if sql_set_option:
 7074                                    try:
 7075                                        float(criterion_value)
 7076                                        sql_update = f"""
 7077                                            UPDATE {table_variants}
 7078                                            SET {sql_set_option}
 7079                                            WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7080                                            AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7081                                            """
 7082                                    except:
 7083                                        contains_option = ""
 7084                                        if criterion_type == "contains":
 7085                                            contains_option = ".*"
 7086                                        sql_update = f"""
 7087                                            UPDATE {table_variants}
 7088                                            SET {sql_set_option}
 7089                                            WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7090                                            """
 7091                                    sql_queries.append(sql_update)
 7092                                else:
 7093                                    log.warning(
 7094                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7095                                    )
 7096
 7097                        # PZTags
 7098                        if (
 7099                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7100                            in list_of_pzfields
 7101                        ):
 7102
 7103                            # Create PZFalgs value
 7104                            pztags_value = ""
 7105                            pztags_sep_default = "|"
 7106                            pztags_sep = ""
 7107                            for pzfield in pzfields:
 7108                                if pzfield not in [f"{pz_prefix}Tags"]:
 7109                                    if (
 7110                                        f"{pzfield}{pzfields_sep}{profile}"
 7111                                        in list_of_pzfields
 7112                                    ):
 7113                                        if pzfield in [f"{pz_prefix}Flag"]:
 7114                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7115                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7116                                                    THEN 'PASS'
 7117                                                    ELSE 'FILTERED'
 7118                                                END, '"""
 7119                                        else:
 7120                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7121                                        pztags_sep = pztags_sep_default
 7122
 7123                            # Add Query update for PZFlags
 7124                            sql_update_pztags = f"""
 7125                                UPDATE {table_variants}
 7126                                SET INFO = concat(
 7127                                        INFO,
 7128                                        CASE WHEN INFO NOT in ('','.')
 7129                                                THEN ';'
 7130                                                ELSE ''
 7131                                        END,
 7132                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7133                                    )
 7134                                """
 7135                            sql_queries.append(sql_update_pztags)
 7136
 7137                            # Add Query update for PZFlags for default
 7138                            if profile == default_profile:
 7139                                sql_update_pztags_default = f"""
 7140                                UPDATE {table_variants}
 7141                                SET INFO = concat(
 7142                                        INFO,
 7143                                        ';',
 7144                                        '{pz_prefix}Tags={pztags_value}'
 7145                                    )
 7146                                """
 7147                                sql_queries.append(sql_update_pztags_default)
 7148
 7149                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7150
 7151                        if sql_queries:
 7152
 7153                            for sql_query in sql_queries:
 7154                                log.debug(
 7155                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7156                                )
 7157                                self.conn.execute(sql_query)
 7158
 7159                        log.info(f"""Profile '{profile}' - Update... """)
 7160                        sql_query_update = f"""
 7161                            UPDATE {table_variants}
 7162                            SET INFO =  
 7163                                concat(
 7164                                    CASE
 7165                                        WHEN INFO NOT IN ('','.')
 7166                                        THEN concat(INFO, ';')
 7167                                        ELSE ''
 7168                                    END
 7169                                    {sql_set_info_option}
 7170                                )
 7171                        """
 7172                        self.conn.execute(sql_query_update)
 7173
 7174        else:
 7175
 7176            log.warning(f"No profiles in parameters")
 7177
 7178        # Remove added columns
 7179        for added_column in added_columns:
 7180            self.drop_column(column=added_column)
 7181
 7182        # Explode INFOS fields into table fields
 7183        if self.get_explode_infos():
 7184            self.explode_infos(
 7185                prefix=self.get_explode_infos_prefix(),
 7186                fields=self.get_explode_infos_fields(),
 7187                force=True,
 7188            )
 7189
 7190        return True
 7191
 7192    ###
 7193    # HGVS
 7194    ###
 7195
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Side effects on the variants table:
        - a temporary 'hgvs_<random>' column is added, filled with the computed
          HGVS names, used to append an 'hgvs=<names>' field to the INFO column,
          and dropped before returning;
        - an 'hgvs' INFO field is registered in the VCF header.

        Only variants whose REF and ALT are strictly alphabetic (SNV/InDel) are
        annotated. Returns early (no-op) when no 'hgvs' section is present in
        the parameters.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            :param row: A dictionary-like object that contains values for the keys
            "CHROM", "POS", "REF" and "ALT"
            :return: a comma-separated string of the HGVS names associated with the
            given row of data (empty string if no transcript overlaps the position).
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # (transcripts overlapping this position, from the pre-loaded refseq_df;
            # polars_conn/refseq_df are captured from the enclosing scope)
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                # NOTE(review): refseqlink_df is only defined when a refSeqLink file
                # was found; if protein output is requested without one, this lookup
                # would fail — confirm upstream configuration guarantees this.
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally also add the protein-level HGVS name alongside the
                # nucleotide-level one
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create liste of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        # (rebound further below once the refseq frames exist; the closures above
        # capture the latest binding)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS
        # Parse the comma-separated "hgvs_options" shortcut ("opt" or "opt=val")
        # into param["hgvs"]; bare options and TRUE/FALSE values become booleans.
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; otherwise nothing to do
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSseq refSeqLink (param values take precedence over config values)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome
        # An explicitly configured genome path wins; otherwise resolve from the
        # genomes folder and assembly
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSseq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (strictly alphabetic REF and ALT)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table
        # (random suffix to avoid clashing with an existing column name)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe
        # (restricted to transcripts that overlap a variant position)
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe
            # (versioned transcript accession -> protein accession mapping)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        # Exported to a TSV first because read_transcripts consumes a file handle.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        # Rebound here so the SQLContext can see refseq_df / refseqlink_df; the
        # partition closures capture this (latest) binding.
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column
            # (only rows with a non-empty, non-NULL computed annotation)
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column
        # Append 'hgvs=<names>' to INFO, with a ';' separator when INFO is not empty
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        # Register the 'hgvs' INFO field in the VCF header
        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns (the temporary hgvs working column)
        for added_column in added_columns:
            self.drop_column(column=added_column)
 7578
 7579    ###
 7580    # Calculation
 7581    ###
 7582
 7583    def get_operations_help(
 7584        self, operations_config_dict: dict = {}, operations_config_file: str = None
 7585    ) -> list:
 7586
 7587        # Init
 7588        operations_help = []
 7589
 7590        # operations
 7591        operations = self.get_config_json(
 7592            name="calculations",
 7593            config_dict=operations_config_dict,
 7594            config_file=operations_config_file,
 7595        )
 7596        for op in operations:
 7597            op_name = operations[op].get("name", op).upper()
 7598            op_description = operations[op].get("description", op_name)
 7599            op_available = operations[op].get("available", False)
 7600            if op_available:
 7601                operations_help.append(f"   {op_name}: {op_description}")
 7602
 7603        # Sort operations
 7604        operations_help.sort()
 7605
 7606        # insert header
 7607        operations_help.insert(0, "Available calculation operations:")
 7608
 7609        # Return
 7610        return operations_help
 7611
 7612    def calculation(
 7613        self,
 7614        operations: dict = {},
 7615        operations_config_dict: dict = {},
 7616        operations_config_file: str = None,
 7617    ) -> None:
 7618        """
 7619        It takes a list of operations, and for each operation, it checks if it's a python or sql
 7620        operation, and then calls the appropriate function
 7621
 7622        param json example:
 7623            "calculation": {
 7624                "NOMEN": {
 7625                    "options": {
 7626                        "hgvs_field": "hgvs"
 7627                    },
 7628                "middle" : null
 7629            }
 7630        """
 7631
 7632        # Param
 7633        param = self.get_param()
 7634
 7635        # operations config
 7636        operations_config = self.get_config_json(
 7637            name="calculations",
 7638            config_dict=operations_config_dict,
 7639            config_file=operations_config_file,
 7640        )
 7641
 7642        # Upper keys
 7643        operations_config = {k.upper(): v for k, v in operations_config.items()}
 7644
 7645        # Calculations
 7646
 7647        # Operations from param
 7648        operations = param.get("calculation", {}).get("calculations", operations)
 7649
 7650        # Quick calculation - add
 7651        if param.get("calculations", None):
 7652            calculations_list = [
 7653                value for value in param.get("calculations", "").split(",")
 7654            ]
 7655            log.info(f"Quick Calculations:")
 7656            for calculation_key in calculations_list:
 7657                log.info(f"   {calculation_key}")
 7658            for calculation_operation in calculations_list:
 7659                if calculation_operation.upper() not in operations:
 7660                    operations[calculation_operation.upper()] = {}
 7661                    add_value_into_dict(
 7662                        dict_tree=param,
 7663                        sections=[
 7664                            "calculation",
 7665                            "calculations",
 7666                            calculation_operation.upper(),
 7667                        ],
 7668                        value={},
 7669                    )
 7670
 7671        # Operations for calculation
 7672        if not operations:
 7673            operations = param.get("calculation", {}).get("calculations", {})
 7674
 7675        if operations:
 7676            log.info(f"Calculations...")
 7677
 7678        # For each operations
 7679        for operation_name in operations:
 7680            operation_name = operation_name.upper()
 7681            if operation_name not in [""]:
 7682                if operation_name in operations_config:
 7683                    log.info(f"Calculation '{operation_name}'")
 7684                    operation = operations_config[operation_name]
 7685                    operation_type = operation.get("type", "sql")
 7686                    if operation_type == "python":
 7687                        self.calculation_process_function(
 7688                            operation=operation, operation_name=operation_name
 7689                        )
 7690                    elif operation_type == "sql":
 7691                        self.calculation_process_sql(
 7692                            operation=operation, operation_name=operation_name
 7693                        )
 7694                    else:
 7695                        log.error(
 7696                            f"Operations config: Type '{operation_type}' NOT available"
 7697                        )
 7698                        raise ValueError(
 7699                            f"Operations config: Type '{operation_type}' NOT available"
 7700                        )
 7701                else:
 7702                    log.error(
 7703                        f"Operations config: Calculation '{operation_name}' NOT available"
 7704                    )
 7705                    raise ValueError(
 7706                        f"Operations config: Calculation '{operation_name}' NOT available"
 7707                    )
 7708
 7709        # Explode INFOS fields into table fields
 7710        if self.get_explode_infos():
 7711            self.explode_infos(
 7712                prefix=self.get_explode_infos_prefix(),
 7713                fields=self.get_explode_infos_fields(),
 7714                force=True,
 7715            )
 7716
 7717    def calculation_process_sql(
 7718        self, operation: dict, operation_name: str = "unknown"
 7719    ) -> None:
 7720        """
 7721        The `calculation_process_sql` function takes in a mathematical operation as a string and
 7722        performs the operation, updating the specified table with the result.
 7723
 7724        :param operation: The `operation` parameter is a dictionary that contains information about the
 7725        mathematical operation to be performed. It includes the following keys:
 7726        :type operation: dict
 7727        :param operation_name: The `operation_name` parameter is a string that represents the name of
 7728        the mathematical operation being performed. It is used for logging and error handling purposes,
 7729        defaults to unknown
 7730        :type operation_name: str (optional)
 7731        """
 7732
 7733        # table variants
 7734        table_variants = self.get_table_variants(clause="alter")
 7735
 7736        # Operation infos
 7737        operation_name = operation.get("name", "unknown")
 7738        log.debug(f"process sql {operation_name}")
 7739        output_column_name = operation.get("output_column_name", operation_name)
 7740        output_column_type = operation.get("output_column_type", "String")
 7741        prefix = operation.get("explode_infos_prefix", "")
 7742        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 7743        output_column_description = operation.get(
 7744            "output_column_description", f"{operation_name} operation"
 7745        )
 7746        operation_query = operation.get("operation_query", None)
 7747        if isinstance(operation_query, list):
 7748            operation_query = " ".join(operation_query)
 7749        operation_info_fields = operation.get("info_fields", [])
 7750        operation_info_fields_check = operation.get("info_fields_check", False)
 7751        operation_info = operation.get("operation_info", True)
 7752
 7753        if operation_query:
 7754
 7755            # Info fields check
 7756            operation_info_fields_check_result = True
 7757            if operation_info_fields_check:
 7758                header_infos = self.get_header().infos
 7759                for info_field in operation_info_fields:
 7760                    operation_info_fields_check_result = (
 7761                        operation_info_fields_check_result
 7762                        and info_field in header_infos
 7763                    )
 7764
 7765            # If info fields available
 7766            if operation_info_fields_check_result:
 7767
 7768                # Added_columns
 7769                added_columns = []
 7770
 7771                # Create VCF header field
 7772                vcf_reader = self.get_header()
 7773                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 7774                    output_column_name,
 7775                    ".",
 7776                    output_column_type,
 7777                    output_column_description,
 7778                    "howard calculation",
 7779                    "0",
 7780                    self.code_type_map.get(output_column_type),
 7781                )
 7782
 7783                # Explode infos if needed
 7784                log.debug(f"calculation_process_sql prefix {prefix}")
 7785                added_columns += self.explode_infos(
 7786                    prefix=prefix,
 7787                    fields=[output_column_name] + operation_info_fields,
 7788                    force=True,
 7789                )
 7790
 7791                # Create column
 7792                added_column = self.add_column(
 7793                    table_name=table_variants,
 7794                    column_name=prefix + output_column_name,
 7795                    column_type=output_column_type_sql,
 7796                    default_value="null",
 7797                )
 7798                added_columns.append(added_column)
 7799
 7800                # Operation calculation
 7801                try:
 7802
 7803                    # Query to update calculation column
 7804                    sql_update = f"""
 7805                        UPDATE {table_variants}
 7806                        SET "{prefix}{output_column_name}" = ({operation_query})
 7807                    """
 7808                    self.conn.execute(sql_update)
 7809
 7810                    # Add to INFO
 7811                    if operation_info:
 7812                        sql_update_info = f"""
 7813                            UPDATE {table_variants}
 7814                            SET "INFO" =
 7815                                concat(
 7816                                    CASE
 7817                                        WHEN "INFO" IS NOT NULL
 7818                                        THEN concat("INFO", ';')
 7819                                        ELSE ''
 7820                                    END,
 7821                                    '{output_column_name}=',
 7822                                    "{prefix}{output_column_name}"
 7823                                )
 7824                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 7825                        """
 7826                        self.conn.execute(sql_update_info)
 7827
 7828                except:
 7829                    log.error(
 7830                        f"Operations config: Calculation '{operation_name}' query failed"
 7831                    )
 7832                    raise ValueError(
 7833                        f"Operations config: Calculation '{operation_name}' query failed"
 7834                    )
 7835
 7836                # Remove added columns
 7837                for added_column in added_columns:
 7838                    log.debug(f"added_column: {added_column}")
 7839                    self.drop_column(column=added_column)
 7840
 7841            else:
 7842                log.error(
 7843                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 7844                )
 7845                raise ValueError(
 7846                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 7847                )
 7848
 7849        else:
 7850            log.error(
 7851                f"Operations config: Calculation '{operation_name}' query NOT defined"
 7852            )
 7853            raise ValueError(
 7854                f"Operations config: Calculation '{operation_name}' query NOT defined"
 7855            )
 7856
 7857    def calculation_process_function(
 7858        self, operation: dict, operation_name: str = "unknown"
 7859    ) -> None:
 7860        """
 7861        The `calculation_process_function` takes in an operation dictionary and performs the specified
 7862        function with the given parameters.
 7863
 7864        :param operation: The `operation` parameter is a dictionary that contains information about the
 7865        operation to be performed. It has the following keys:
 7866        :type operation: dict
 7867        :param operation_name: The `operation_name` parameter is a string that represents the name of
 7868        the operation being performed. It is used for logging purposes, defaults to unknown
 7869        :type operation_name: str (optional)
 7870        """
 7871
 7872        operation_name = operation["name"]
 7873        log.debug(f"process sql {operation_name}")
 7874        function_name = operation["function_name"]
 7875        function_params = operation["function_params"]
 7876        getattr(self, function_name)(*function_params)
 7877
 7878    def calculation_variant_id(self) -> None:
 7879        """
 7880        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 7881        updates the INFO field of a variants table with the variant ID.
 7882        """
 7883
 7884        # variant_id annotation field
 7885        variant_id_tag = self.get_variant_id_column()
 7886        added_columns = [variant_id_tag]
 7887
 7888        # variant_id hgvs tags"
 7889        vcf_infos_tags = {
 7890            variant_id_tag: "howard variant ID annotation",
 7891        }
 7892
 7893        # Variants table
 7894        table_variants = self.get_table_variants()
 7895
 7896        # Header
 7897        vcf_reader = self.get_header()
 7898
 7899        # Add variant_id to header
 7900        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 7901            variant_id_tag,
 7902            ".",
 7903            "String",
 7904            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 7905            "howard calculation",
 7906            "0",
 7907            self.code_type_map.get("String"),
 7908        )
 7909
 7910        # Update
 7911        sql_update = f"""
 7912            UPDATE {table_variants}
 7913            SET "INFO" = 
 7914                concat(
 7915                    CASE
 7916                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 7917                        THEN ''
 7918                        ELSE concat("INFO", ';')
 7919                    END,
 7920                    '{variant_id_tag}=',
 7921                    "{variant_id_tag}"
 7922                )
 7923        """
 7924        self.conn.execute(sql_update)
 7925
 7926        # Remove added columns
 7927        for added_column in added_columns:
 7928            self.drop_column(column=added_column)
 7929
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and appends them as a new INFO annotation on the variants
        table. If the SnpEff field is absent from the header, a warning is logged and nothing is
        changed.

        :param snpeff_hgvs: Name of the INFO annotation that will store the HGVS nomenclatures
        extracted from the SnpEff annotation field, defaults to snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: INFO field containing the SnpEff annotations to parse, defaults
        to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the SnpEff header description cannot be parsed
        """

        # INFO header description for the new annotation
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix for exploded INFO columns
        # NOTE(review): any non-empty prefix is replaced by "INFO/" — looks like a
        # normalization of exploded-column prefixes; confirm this is intentional
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded-column names: ANN source column and new HGVS column
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped at the end
        added_columns = []

        # Explode the SnpEff field into a table column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract the annotation sub-field names from the quoted part of the
            # header description (sub-fields are separated by " | ")
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alphanumeric-only key mapped to the original sub-field label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Variant id column used as join key below
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Dataframe of variant ids and their ANN values
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Extract the HGVS nomenclature from each ANN value
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Declare the snpeff_hgvs INFO tag in the VCF header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<snpeff_hgvs>=<value>' to INFO, joining on the variant id.
            # 'dataframe_snpeff_hgvs' in the FROM clause is the pandas DataFrame
            # above, resolved by duckdb's replacement scan on Python locals —
            # renaming the variable would break this query
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                    '{snpeff_hgvs}=',
                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Free the dataframe promptly (it can be large)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8066
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
        exploding the annotation sub-fields and appending the result to the INFO column. If the
        SnpEff field is absent from the header, a warning is logged and nothing is changed.

        :param uniquify: The `uniquify` parameter is a boolean flag forwarded to
        `explode_snpeff_ann`; when set to `True`, it indicates that the output should be unique,
        meaning that duplicate entries should be removed, defaults to True
        :type uniquify: bool (optional)
        :param output_format: The `output_format` parameter specifies the format in which the
        output annotations will be generated: "fields" (one INFO tag per annotation sub-field)
        or "JSON" (a single INFO tag holding a JSON document), defaults to fields
        :type output_format: str (optional)
        :param output_prefix: The `output_prefix` parameter is the prefix added to the output
        annotations generated during the calculation, to differentiate the newly added
        annotations from existing ones, defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: The `snpeff_field` parameter specifies the INFO field that contains
        the SnpEff annotations to explode, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the SnpEff header description cannot be parsed
        """

        # Internal annotation name used for the temporary exploded column
        snpeff_hgvs = "snpeff_ann_explode"

        # INFO header description for the new annotation(s)
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix for exploded INFO columns
        # NOTE(review): any non-empty prefix is replaced by "INFO/" — looks like a
        # normalization of exploded-column prefixes; confirm this is intentional
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded-column names: ANN source column and output column
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped at the end
        added_columns = []

        # Explode the SnpEff field into a table column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract the annotation sub-field names from the quoted part of the
            # header description (sub-fields are separated by " | ")
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alphanumeric-only key mapped to the original sub-field label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Variant id column used as join key below
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Dataframe of variant ids and their ANN values
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Explode each ANN value into the requested output format
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Declare the output INFO tag(s) in the VCF header: a single JSON tag,
            # or one tag per annotation sub-field
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Append the exploded annotations to INFO, joining on the variant id.
            # 'dataframe_snpeff_hgvs' in the FROM clause is the pandas DataFrame
            # above, resolved by duckdb's replacement scan on Python locals —
            # renaming the variable would break this query
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Free the dataframe promptly (it can be large)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8241
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        The HGVS field (param 'calculation.calculations.NOMEN.options.hgvs_field', default
        'hgvs') is exploded into a column, each value is parsed with `find_nomen` (optionally
        restricted to a transcripts-of-preference file), and the resulting NOMEN sub-fields are
        appended to the INFO column.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Temporary dataframe column holding the parsed NOMEN dict
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: INFO tag -> header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Name of the INFO field holding the HGVS nomenclatures to parse
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None) if False else (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        ))

        # Optional transcripts-of-preference file (first column = transcript ids)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Columns added along the way, dropped at the end
        added_columns = []

        # Explode the HGVS field into a table column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Only proceed if the exploded HGVS column is actually present
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Dataframe of variant keys and their HGVS values
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Parse each HGVS value into a dict of NOMEN sub-fields
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode the NOMEN structure into columns and build the SQL chunks
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Declare the INFO tag in the VCF header
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # Each non-empty sub-field contributes a ';<FIELD>=<value>' chunk
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # Concatenation of all per-field chunks
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append the NOMEN fields to INFO, joining on the variant key.
            # 'dataframe_hgvs' in the FROM clause is the pandas DataFrame above,
            # resolved by duckdb's replacement scan on Python locals — renaming
            # the variable would break this query.
            # NOTE(review): when INFO is NULL/empty, the appended chunks still
            # start with ';', leaving a leading semicolon — confirm downstream
            # tools tolerate this
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Free the dataframe promptly (it can be large)
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8384
 8385    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 8386        """
 8387        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 8388        pipeline/sample for a variant and updates the variant information in a VCF file.
 8389
 8390        :param tag: The `tag` parameter is a string that represents the annotation field for the
 8391        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 8392        VCF header and to update the corresponding field in the variants table, defaults to
 8393        findbypipeline
 8394        :type tag: str (optional)
 8395        """
 8396
 8397        # if FORMAT and samples
 8398        if (
 8399            "FORMAT" in self.get_header_columns_as_list()
 8400            and self.get_header_sample_list()
 8401        ):
 8402
 8403            # findbypipeline annotation field
 8404            findbypipeline_tag = tag
 8405
 8406            # VCF infos tags
 8407            vcf_infos_tags = {
 8408                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 8409            }
 8410
 8411            # Prefix
 8412            prefix = self.get_explode_infos_prefix()
 8413
 8414            # Field
 8415            findbypipeline_infos = prefix + findbypipeline_tag
 8416
 8417            # Variants table
 8418            table_variants = self.get_table_variants()
 8419
 8420            # Header
 8421            vcf_reader = self.get_header()
 8422
 8423            # Create variant id
 8424            variant_id_column = self.get_variant_id_column()
 8425            added_columns = [variant_id_column]
 8426
 8427            # variant_id, FORMAT and samples
 8428            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8429                self.get_header_sample_list()
 8430            )
 8431
 8432            # Create dataframe
 8433            dataframe_findbypipeline = self.get_query_to_df(
 8434                f""" SELECT {samples_fields} FROM {table_variants} """
 8435            )
 8436
 8437            # Create findbypipeline column
 8438            dataframe_findbypipeline[findbypipeline_infos] = (
 8439                dataframe_findbypipeline.apply(
 8440                    lambda row: findbypipeline(
 8441                        row, samples=self.get_header_sample_list()
 8442                    ),
 8443                    axis=1,
 8444                )
 8445            )
 8446
 8447            # Add snpeff_hgvs to header
 8448            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 8449                findbypipeline_tag,
 8450                ".",
 8451                "String",
 8452                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 8453                "howard calculation",
 8454                "0",
 8455                self.code_type_map.get("String"),
 8456            )
 8457
 8458            # Update
 8459            sql_update = f"""
 8460                UPDATE variants
 8461                SET "INFO" = 
 8462                    concat(
 8463                        CASE
 8464                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8465                            THEN ''
 8466                            ELSE concat("INFO", ';')
 8467                        END,
 8468                        CASE 
 8469                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 8470                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 8471                            THEN concat(
 8472                                    '{findbypipeline_tag}=',
 8473                                    dataframe_findbypipeline."{findbypipeline_infos}"
 8474                                )
 8475                            ELSE ''
 8476                        END
 8477                    )
 8478                FROM dataframe_findbypipeline
 8479                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 8480            """
 8481            self.conn.execute(sql_update)
 8482
 8483            # Remove added columns
 8484            for added_column in added_columns:
 8485                self.drop_column(column=added_column)
 8486
 8487            # Delete dataframe
 8488            del dataframe_findbypipeline
 8489            gc.collect()
 8490
 8491    def calculation_genotype_concordance(self) -> None:
 8492        """
 8493        The function `calculation_genotype_concordance` calculates the genotype concordance for
 8494        multi-caller VCF files and updates the variant information in the database.
 8495        """
 8496
 8497        # if FORMAT and samples
 8498        if (
 8499            "FORMAT" in self.get_header_columns_as_list()
 8500            and self.get_header_sample_list()
 8501        ):
 8502
 8503            # genotypeconcordance annotation field
 8504            genotypeconcordance_tag = "genotypeconcordance"
 8505
 8506            # VCF infos tags
 8507            vcf_infos_tags = {
 8508                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 8509            }
 8510
 8511            # Prefix
 8512            prefix = self.get_explode_infos_prefix()
 8513
 8514            # Field
 8515            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 8516
 8517            # Variants table
 8518            table_variants = self.get_table_variants()
 8519
 8520            # Header
 8521            vcf_reader = self.get_header()
 8522
 8523            # Create variant id
 8524            variant_id_column = self.get_variant_id_column()
 8525            added_columns = [variant_id_column]
 8526
 8527            # variant_id, FORMAT and samples
 8528            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8529                self.get_header_sample_list()
 8530            )
 8531
 8532            # Create dataframe
 8533            dataframe_genotypeconcordance = self.get_query_to_df(
 8534                f""" SELECT {samples_fields} FROM {table_variants} """
 8535            )
 8536
 8537            # Create genotypeconcordance column
 8538            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 8539                dataframe_genotypeconcordance.apply(
 8540                    lambda row: genotypeconcordance(
 8541                        row, samples=self.get_header_sample_list()
 8542                    ),
 8543                    axis=1,
 8544                )
 8545            )
 8546
 8547            # Add genotypeconcordance to header
 8548            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 8549                genotypeconcordance_tag,
 8550                ".",
 8551                "String",
 8552                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 8553                "howard calculation",
 8554                "0",
 8555                self.code_type_map.get("String"),
 8556            )
 8557
 8558            # Update
 8559            sql_update = f"""
 8560                UPDATE variants
 8561                SET "INFO" = 
 8562                    concat(
 8563                        CASE
 8564                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8565                            THEN ''
 8566                            ELSE concat("INFO", ';')
 8567                        END,
 8568                        CASE
 8569                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 8570                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 8571                            THEN concat(
 8572                                    '{genotypeconcordance_tag}=',
 8573                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 8574                                )
 8575                            ELSE ''
 8576                        END
 8577                    )
 8578                FROM dataframe_genotypeconcordance
 8579                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 8580            """
 8581            self.conn.execute(sql_update)
 8582
 8583            # Remove added columns
 8584            for added_column in added_columns:
 8585                self.drop_column(column=added_column)
 8586
 8587            # Delete dataframe
 8588            del dataframe_genotypeconcordance
 8589            gc.collect()
 8590
 8591    def calculation_barcode(self, tag: str = "barcode") -> None:
 8592        """
 8593        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 8594        updates the INFO field in the file with the calculated barcode values.
 8595
 8596        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 8597        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 8598        the default tag name is set to "barcode", defaults to barcode
 8599        :type tag: str (optional)
 8600        """
 8601
 8602        # if FORMAT and samples
 8603        if (
 8604            "FORMAT" in self.get_header_columns_as_list()
 8605            and self.get_header_sample_list()
 8606        ):
 8607
 8608            # barcode annotation field
 8609            if not tag:
 8610                tag = "barcode"
 8611
 8612            # VCF infos tags
 8613            vcf_infos_tags = {
 8614                tag: "barcode calculation (VaRank)",
 8615            }
 8616
 8617            # Prefix
 8618            prefix = self.get_explode_infos_prefix()
 8619
 8620            # Field
 8621            barcode_infos = prefix + tag
 8622
 8623            # Variants table
 8624            table_variants = self.get_table_variants()
 8625
 8626            # Header
 8627            vcf_reader = self.get_header()
 8628
 8629            # Create variant id
 8630            variant_id_column = self.get_variant_id_column()
 8631            added_columns = [variant_id_column]
 8632
 8633            # variant_id, FORMAT and samples
 8634            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8635                self.get_header_sample_list()
 8636            )
 8637
 8638            # Create dataframe
 8639            dataframe_barcode = self.get_query_to_df(
 8640                f""" SELECT {samples_fields} FROM {table_variants} """
 8641            )
 8642
 8643            # Create barcode column
 8644            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 8645                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 8646            )
 8647
 8648            # Add barcode to header
 8649            vcf_reader.infos[tag] = vcf.parser._Info(
 8650                tag,
 8651                ".",
 8652                "String",
 8653                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 8654                "howard calculation",
 8655                "0",
 8656                self.code_type_map.get("String"),
 8657            )
 8658
 8659            # Update
 8660            sql_update = f"""
 8661                UPDATE {table_variants}
 8662                SET "INFO" = 
 8663                    concat(
 8664                        CASE
 8665                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8666                            THEN ''
 8667                            ELSE concat("INFO", ';')
 8668                        END,
 8669                        CASE
 8670                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 8671                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 8672                            THEN concat(
 8673                                    '{tag}=',
 8674                                    dataframe_barcode."{barcode_infos}"
 8675                                )
 8676                            ELSE ''
 8677                        END
 8678                    )
 8679                FROM dataframe_barcode
 8680                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 8681            """
 8682            self.conn.execute(sql_update)
 8683
 8684            # Remove added columns
 8685            for added_column in added_columns:
 8686                self.drop_column(column=added_column)
 8687
 8688            # Delete dataframe
 8689            del dataframe_barcode
 8690            gc.collect()
 8691
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the INFO field in the file with the calculated barcode values.

        The family pedigree is read from param 'calculation.calculations.BARCODEFAMILY.family_pedigree'
        and may be a JSON file path, a JSON string, a comma-separated list of sample names, or a dict;
        when absent, all samples of the VCF are used. The barcode and the list of family samples are
        appended to each sample genotype (FORMAT fields '{tag}' and '{tag}S').

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not well formatted or resolves to no samples
        """

        # Only applicable when the VCF carries genotype data (a FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against an explicit empty/None tag)
            if not tag:
                tag = "BCF"

            # VCF infos tags: descriptions for the two new FORMAT fields
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for intermediate/exploded columns
            prefix = self.get_explode_infos_prefix()

            # PED param (pedigree definition from the calculation parameters)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED — `ped` is progressively normalized to a dict {member: sample}
            if ped:

                # Pedigree is a file (NOTE: `ped` is re-bound from path to file handle to dict)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: try JSON first, otherwise treat it as a
                # comma-separated list of sample names (each mapping to itself)
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict: already in the expected shape
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct the list of family sample names from the normalized dict
                ped_samples = list(ped.values())

            else:
                # No pedigree provided: use every sample, each mapping to itself
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree resolved to at least one member
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Name of the intermediate dataframe column holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (join key for the UPDATE below);
            # tracked so it can be dropped afterwards
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and the family samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe with only the columns needed for the calculation
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode for each variant row
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare the two new FORMAT fields in the VCF header:
            # '{tag}' (barcode value) and '{tag}S' (family sample list)
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per column: family samples get the barcode value,
            # FORMAT gets the new field names appended, other samples get '.'
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # Regex used to turn the FORMAT string into a './.:.:...' placeholder
                # for missing genotypes (strips field names, keeps the ':' separators)
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in a single UPDATE joined on the variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to free memory
            del dataframe_barcode
            gc.collect()
 8881
 8882    def calculation_trio(self) -> None:
 8883        """
 8884        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 8885        information to the INFO field of each variant.
 8886        """
 8887
 8888        # if FORMAT and samples
 8889        if (
 8890            "FORMAT" in self.get_header_columns_as_list()
 8891            and self.get_header_sample_list()
 8892        ):
 8893
 8894            # trio annotation field
 8895            trio_tag = "trio"
 8896
 8897            # VCF infos tags
 8898            vcf_infos_tags = {
 8899                "trio": "trio calculation",
 8900            }
 8901
 8902            # Param
 8903            param = self.get_param()
 8904
 8905            # Prefix
 8906            prefix = self.get_explode_infos_prefix()
 8907
 8908            # Trio param
 8909            trio_ped = (
 8910                param.get("calculation", {})
 8911                .get("calculations", {})
 8912                .get("TRIO", {})
 8913                .get("trio_pedigree", None)
 8914            )
 8915
 8916            # Load trio
 8917            if trio_ped:
 8918
 8919                # Trio pedigree is a file
 8920                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 8921                    log.debug("TRIO pedigree is file")
 8922                    with open(full_path(trio_ped)) as trio_ped:
 8923                        trio_ped = json.load(trio_ped)
 8924
 8925                # Trio pedigree is a string
 8926                elif isinstance(trio_ped, str):
 8927                    log.debug("TRIO pedigree is str")
 8928                    try:
 8929                        trio_ped = json.loads(trio_ped)
 8930                        log.debug("TRIO pedigree is json str")
 8931                    except ValueError as e:
 8932                        trio_samples = trio_ped.split(",")
 8933                        if len(trio_samples) == 3:
 8934                            trio_ped = {
 8935                                "father": trio_samples[0],
 8936                                "mother": trio_samples[1],
 8937                                "child": trio_samples[2],
 8938                            }
 8939                            log.debug("TRIO pedigree is list str")
 8940                        else:
 8941                            msg_error = "TRIO pedigree not well formatted"
 8942                            log.error(msg_error)
 8943                            raise ValueError(msg_error)
 8944
 8945                # Trio pedigree is a dict
 8946                elif isinstance(trio_ped, dict):
 8947                    log.debug("TRIO pedigree is dict")
 8948
 8949                # Trio pedigree is not well formatted
 8950                else:
 8951                    msg_error = "TRIO pedigree not well formatted"
 8952                    log.error(msg_error)
 8953                    raise ValueError(msg_error)
 8954
 8955                # Construct trio list
 8956                trio_samples = [
 8957                    trio_ped.get("father", ""),
 8958                    trio_ped.get("mother", ""),
 8959                    trio_ped.get("child", ""),
 8960                ]
 8961
 8962            else:
 8963                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 8964                samples_list = self.get_header_sample_list()
 8965                if len(samples_list) >= 3:
 8966                    trio_samples = self.get_header_sample_list()[0:3]
 8967                    trio_ped = {
 8968                        "father": trio_samples[0],
 8969                        "mother": trio_samples[1],
 8970                        "child": trio_samples[2],
 8971                    }
 8972                else:
 8973                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 8974                    log.error(msg_error)
 8975                    raise ValueError(msg_error)
 8976
 8977            # Check trio pedigree
 8978            if not trio_ped or len(trio_ped) != 3:
 8979                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 8980                log.error(msg_error)
 8981                raise ValueError(msg_error)
 8982
 8983            # Log
 8984            log.info(
 8985                f"Calculation 'TRIO' - Samples: "
 8986                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 8987            )
 8988
 8989            # Field
 8990            trio_infos = prefix + trio_tag
 8991
 8992            # Variants table
 8993            table_variants = self.get_table_variants()
 8994
 8995            # Header
 8996            vcf_reader = self.get_header()
 8997
 8998            # Create variant id
 8999            variant_id_column = self.get_variant_id_column()
 9000            added_columns = [variant_id_column]
 9001
 9002            # variant_id, FORMAT and samples
 9003            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9004                self.get_header_sample_list()
 9005            )
 9006
 9007            # Create dataframe
 9008            dataframe_trio = self.get_query_to_df(
 9009                f""" SELECT {samples_fields} FROM {table_variants} """
 9010            )
 9011
 9012            # Create trio column
 9013            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9014                lambda row: trio(row, samples=trio_samples), axis=1
 9015            )
 9016
 9017            # Add trio to header
 9018            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9019                trio_tag,
 9020                ".",
 9021                "String",
 9022                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9023                "howard calculation",
 9024                "0",
 9025                self.code_type_map.get("String"),
 9026            )
 9027
 9028            # Update
 9029            sql_update = f"""
 9030                UPDATE {table_variants}
 9031                SET "INFO" = 
 9032                    concat(
 9033                        CASE
 9034                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9035                            THEN ''
 9036                            ELSE concat("INFO", ';')
 9037                        END,
 9038                        CASE
 9039                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9040                             AND dataframe_trio."{trio_infos}" NOT NULL
 9041                            THEN concat(
 9042                                    '{trio_tag}=',
 9043                                    dataframe_trio."{trio_infos}"
 9044                                )
 9045                            ELSE ''
 9046                        END
 9047                    )
 9048                FROM dataframe_trio
 9049                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9050            """
 9051            self.conn.execute(sql_update)
 9052
 9053            # Remove added columns
 9054            for added_column in added_columns:
 9055                self.drop_column(column=added_column)
 9056
 9057            # Delete dataframe
 9058            del dataframe_trio
 9059            gc.collect()
 9060
 9061    def calculation_vaf_normalization(self) -> None:
 9062        """
 9063        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9064        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9065        :return: The function does not return anything.
 9066        """
 9067
 9068        # if FORMAT and samples
 9069        if (
 9070            "FORMAT" in self.get_header_columns_as_list()
 9071            and self.get_header_sample_list()
 9072        ):
 9073
 9074            # vaf_normalization annotation field
 9075            vaf_normalization_tag = "VAF"
 9076
 9077            # VCF infos tags
 9078            vcf_infos_tags = {
 9079                "VAF": "VAF Variant Frequency",
 9080            }
 9081
 9082            # Prefix
 9083            prefix = self.get_explode_infos_prefix()
 9084
 9085            # Variants table
 9086            table_variants = self.get_table_variants()
 9087
 9088            # Header
 9089            vcf_reader = self.get_header()
 9090
 9091            # Do not calculate if VAF already exists
 9092            if "VAF" in vcf_reader.formats:
 9093                log.debug("VAF already on genotypes")
 9094                return
 9095
 9096            # Create variant id
 9097            variant_id_column = self.get_variant_id_column()
 9098            added_columns = [variant_id_column]
 9099
 9100            # variant_id, FORMAT and samples
 9101            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9102                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9103            )
 9104
 9105            # Create dataframe
 9106            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9107            log.debug(f"query={query}")
 9108            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9109
 9110            vaf_normalization_set = []
 9111
 9112            # for each sample vaf_normalization
 9113            for sample in self.get_header_sample_list():
 9114                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9115                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9116                )
 9117                vaf_normalization_set.append(
 9118                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9119                )
 9120
 9121            # Add VAF to FORMAT
 9122            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9123                "FORMAT"
 9124            ].apply(lambda x: str(x) + ":VAF")
 9125            vaf_normalization_set.append(
 9126                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9127            )
 9128
 9129            # Add vaf_normalization to header
 9130            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9131                id=vaf_normalization_tag,
 9132                num="1",
 9133                type="Float",
 9134                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9135                type_code=self.code_type_map.get("Float"),
 9136            )
 9137
 9138            # Create fields to add in INFO
 9139            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9140
 9141            # Update
 9142            sql_update = f"""
 9143                UPDATE {table_variants}
 9144                SET {sql_vaf_normalization_set}
 9145                FROM dataframe_vaf_normalization
 9146                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9147
 9148            """
 9149            self.conn.execute(sql_update)
 9150
 9151            # Remove added columns
 9152            for added_column in added_columns:
 9153                self.drop_column(column=added_column)
 9154
 9155            # Delete dataframe
 9156            del dataframe_vaf_normalization
 9157            gc.collect()
 9158
 9159    def calculation_genotype_stats(self, info: str = "VAF") -> None:
 9160        """
 9161        The `calculation_genotype_stats` function calculates genotype statistics for a given information
 9162        field in a VCF file and updates the INFO column of the variants table with the calculated
 9163        statistics.
 9164
 9165        :param info: The `info` parameter is a string that represents the type of information for which
 9166        genotype statistics are calculated. It is used to generate various VCF info tags for the
 9167        statistics, such as the number of occurrences, the list of values, the minimum value, the
 9168        maximum value, the mean, the median, defaults to VAF
 9169        :type info: str (optional)
 9170        """
 9171
 9172        # if FORMAT and samples
 9173        if (
 9174            "FORMAT" in self.get_header_columns_as_list()
 9175            and self.get_header_sample_list()
 9176        ):
 9177
 9178            # vaf_stats annotation field
 9179            vaf_stats_tag = info + "_stats"
 9180
 9181            # VCF infos tags
 9182            vcf_infos_tags = {
 9183                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
 9184                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
 9185                info + "_stats_min": f"genotype {info} Statistics - min {info}",
 9186                info + "_stats_max": f"genotype {info} Statistics - max {info}",
 9187                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
 9188                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
 9189                info
 9190                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
 9191            }
 9192
 9193            # Prefix
 9194            prefix = self.get_explode_infos_prefix()
 9195
 9196            # Field
 9197            vaf_stats_infos = prefix + vaf_stats_tag
 9198
 9199            # Variants table
 9200            table_variants = self.get_table_variants()
 9201
 9202            # Header
 9203            vcf_reader = self.get_header()
 9204
 9205            # Create variant id
 9206            variant_id_column = self.get_variant_id_column()
 9207            added_columns = [variant_id_column]
 9208
 9209            # variant_id, FORMAT and samples
 9210            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9211                self.get_header_sample_list()
 9212            )
 9213
 9214            # Create dataframe
 9215            dataframe_vaf_stats = self.get_query_to_df(
 9216                f""" SELECT {samples_fields} FROM {table_variants} """
 9217            )
 9218
 9219            # Create vaf_stats column
 9220            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
 9221                lambda row: genotype_stats(
 9222                    row, samples=self.get_header_sample_list(), info=info
 9223                ),
 9224                axis=1,
 9225            )
 9226
 9227            # List of vcf tags
 9228            sql_vaf_stats_fields = []
 9229
 9230            # Check all VAF stats infos
 9231            for stat in vcf_infos_tags:
 9232
 9233                # Extract stats
 9234                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
 9235                    lambda x: dict(x).get(stat, "")
 9236                )
 9237
 9238                # Add snpeff_hgvs to header
 9239                vcf_reader.infos[stat] = vcf.parser._Info(
 9240                    stat,
 9241                    ".",
 9242                    "String",
 9243                    vcf_infos_tags.get(stat, "genotype statistics"),
 9244                    "howard calculation",
 9245                    "0",
 9246                    self.code_type_map.get("String"),
 9247                )
 9248
 9249                if len(sql_vaf_stats_fields):
 9250                    sep = ";"
 9251                else:
 9252                    sep = ""
 9253
 9254                # Create fields to add in INFO
 9255                sql_vaf_stats_fields.append(
 9256                    f"""
 9257                        CASE
 9258                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
 9259                            THEN concat(
 9260                                    '{sep}{stat}=',
 9261                                    dataframe_vaf_stats."{stat}"
 9262                                )
 9263                            ELSE ''
 9264                        END
 9265                    """
 9266                )
 9267
 9268            # SQL set for update
 9269            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)
 9270
 9271            # Update
 9272            sql_update = f"""
 9273                UPDATE {table_variants}
 9274                SET "INFO" = 
 9275                    concat(
 9276                        CASE
 9277                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9278                            THEN ''
 9279                            ELSE concat("INFO", ';')
 9280                        END,
 9281                        {sql_vaf_stats_fields_set}
 9282                    )
 9283                FROM dataframe_vaf_stats
 9284                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
 9285
 9286            """
 9287            self.conn.execute(sql_update)
 9288
 9289            # Remove added columns
 9290            for added_column in added_columns:
 9291                self.drop_column(column=added_column)
 9292
 9293            # Delete dataframe
 9294            del dataframe_vaf_stats
 9295            gc.collect()
 9296
 9297    def calculation_transcripts_json(self, info: str = "transcripts_json") -> None:
 9298        """
 9299        The function `calculation_transcripts_json` creates a transcripts table and adds an info field
 9300        to it if transcripts are available.
 9301
 9302        :param info: The `info` parameter in the `calculation_transcripts_json` method is a string
 9303        parameter that specifies the information field to be used in the transcripts JSON. It has a
 9304        default value of "transcripts_json" if no value is provided when calling the method, defaults to
 9305        transcripts_json
 9306        :type info: str (optional)
 9307        """
 9308
 9309        # Create transcripts table
 9310        transcripts_table = self.create_transcript_view()
 9311
 9312        # Add info field
 9313        if transcripts_table:
 9314            self.transcript_view_to_variants(
 9315                transcripts_table=transcripts_table, transcripts_info_field=info
 9316            )
 9317        else:
 9318            log.info("No Transcripts to process. Check param.json file configuration")
 9319
 9320    def calculation_transcripts_prioritization(self) -> None:
 9321        """
 9322        The function `calculation_transcripts_prioritization` creates a transcripts table and
 9323        prioritizes transcripts based on certain criteria.
 9324        """
 9325
 9326        # Create transcripts table
 9327        transcripts_table = self.create_transcript_view()
 9328
 9329        # Add info field
 9330        if transcripts_table:
 9331            self.transcripts_prioritization(transcripts_table=transcripts_table)
 9332        else:
 9333            log.info("No Transcripts to process. Check param.json file configuration")
 9334
 9335    ###############
 9336    # Transcripts #
 9337    ###############
 9338
 9339    def transcripts_prioritization(
 9340        self, transcripts_table: str = None, param: dict = {}
 9341    ) -> bool:
 9342        """
 9343        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
 9344        and updates the variants table with the prioritized information.
 9345
 9346        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
 9347        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
 9348        This parameter is used to identify the table where the transcripts data is stored for the
 9349        prioritization process
 9350        :type transcripts_table: str
 9351        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
 9352        that contains various configuration settings for the prioritization process of transcripts. It
 9353        is used to customize the behavior of the prioritization algorithm and includes settings such as
 9354        the prefix for prioritization fields, default profiles, and other
 9355        :type param: dict
 9356        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
 9357        transcripts prioritization process is successfully completed, and `False` if there are any
 9358        issues or if no profile is defined for transcripts prioritization.
 9359        """
 9360
 9361        log.debug("Start transcripts prioritization...")
 9362
 9363        # Param
 9364        if not param:
 9365            param = self.get_param()
 9366
 9367        # Variants table
 9368        table_variants = self.get_table_variants()
 9369        log.debug(f"transcripts_table={transcripts_table}")
 9370        # Transcripts table
 9371        if transcripts_table is None:
 9372            log.debug(f"transcripts_table={transcripts_table}")
 9373            transcripts_table = self.create_transcript_view(
 9374                transcripts_table="transcripts", param=param
 9375            )
 9376            log.debug(f"transcripts_table={transcripts_table}")
 9377        if transcripts_table is None:
 9378            msg_err = "No Transcripts table availalble"
 9379            log.error(msg_err)
 9380            raise ValueError(msg_err)
 9381
 9382        # Get transcripts columns
 9383        columns_as_list_query = f"""
 9384            DESCRIBE {transcripts_table}
 9385        """
 9386        columns_as_list = list(
 9387            self.get_query_to_df(columns_as_list_query)["column_name"]
 9388        )
 9389
 9390        # Create INFO if not exists
 9391        if "INFO" not in columns_as_list:
 9392            query_add_info = f"""
 9393                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
 9394            """
 9395            self.execute_query(query_add_info)
 9396
 9397        # Prioritization param and Force only PZ Score and Flag
 9398        pz_param = param.get("transcripts", {}).get("prioritization", {})
 9399        pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
 9400        pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
 9401        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
 9402        pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
 9403        pz_profile_default = (
 9404            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
 9405        )
 9406
 9407        # Exit if no profile
 9408        if pz_profile_default is None:
 9409            log.warning("No profile defined for transcripts prioritization")
 9410            return False
 9411
 9412        # Prioritization
 9413        prioritization_result = self.prioritization(
 9414            table=transcripts_table,
 9415            pz_param=param.get("transcripts", {}).get("prioritization", {}),
 9416        )
 9417        if not prioritization_result:
 9418            log.warning("Transcripts prioritization not processed")
 9419            return False
 9420
 9421        # Explode PZ fields
 9422        self.explode_infos(
 9423            table=transcripts_table,
 9424            fields=param.get("transcripts", {})
 9425            .get("prioritization", {})
 9426            .get("pzfields", []),
 9427        )
 9428
 9429        # Export Transcripts prioritization infos to variants table
 9430        query_update = f"""
 9431            WITH RankedTranscripts AS (
 9432                SELECT
 9433                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
 9434                    ROW_NUMBER() OVER (
 9435                        PARTITION BY "#CHROM", POS, REF, ALT
 9436                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
 9437                    ) AS rn
 9438                FROM
 9439                    {transcripts_table}
 9440            )
 9441            UPDATE {table_variants}
 9442                SET
 9443                INFO = CONCAT(CASE
 9444                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9445                            THEN ''
 9446                            ELSE concat("INFO", ';')
 9447                        END,
 9448                        concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
 9449                        )
 9450            FROM
 9451                RankedTranscripts
 9452            WHERE
 9453                rn = 1
 9454                AND variants."#CHROM" = RankedTranscripts."#CHROM"
 9455                AND variants."POS" = RankedTranscripts."POS"
 9456                AND variants."REF" = RankedTranscripts."REF"
 9457                AND variants."ALT" = RankedTranscripts."ALT"
 9458                
 9459        """
 9460        self.execute_query(query=query_update)
 9461
 9462        # Add PZ Transcript in header
 9463        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
 9464            pz_fields_transcripts,
 9465            ".",
 9466            "String",
 9467            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
 9468            "unknown",
 9469            "unknown",
 9470            code_type_map["String"],
 9471        )
 9472
 9473        # Return
 9474        return True
 9475
 9476    def create_transcript_view_from_columns_map(
 9477        self,
 9478        transcripts_table: str = "transcripts",
 9479        columns_maps: dict = {},
 9480        added_columns: list = [],
 9481        temporary_tables: list = None,
 9482        annotation_fields: list = None,
 9483    ) -> tuple[list, list, list]:
 9484        """
 9485        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
 9486        specified columns mapping for transcripts data.
 9487
 9488        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9489        the table where the transcripts data is stored or will be stored in the database. This table
 9490        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
 9491        predictions, etc. It defaults to "transcripts, defaults to transcripts
 9492        :type transcripts_table: str (optional)
 9493        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
 9494        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
 9495        represents a mapping configuration for a specific set of columns. It typically includes details such
 9496        as the main transcript column and additional information columns
 9497        :type columns_maps: dict
 9498        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
 9499        function is a list that stores the additional columns that will be added to the view being created
 9500        based on the columns map provided. These columns are generated by exploding the transcript
 9501        information columns along with the main transcript column
 9502        :type added_columns: list
 9503        :param temporary_tables: The `temporary_tables` parameter in the
 9504        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
 9505        tables created during the process of creating a transcript view from a columns map. These temporary
 9506        tables are used to store intermediate results or transformations before the final view is generated
 9507        :type temporary_tables: list
 9508        :param annotation_fields: The `annotation_fields` parameter in the
 9509        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
 9510        for annotation in the query view creation process. These fields are extracted from the
 9511        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
 9512        :type annotation_fields: list
 9513        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
 9514        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
 9515        """
 9516
 9517        log.debug("Start transcrpts view creation from columns map...")
 9518
 9519        # "from_columns_map": [
 9520        #     {
 9521        #         "transcripts_column": "Ensembl_transcriptid",
 9522        #         "transcripts_infos_columns": [
 9523        #             "genename",
 9524        #             "Ensembl_geneid",
 9525        #             "LIST_S2_score",
 9526        #             "LIST_S2_pred",
 9527        #         ],
 9528        #     },
 9529        #     {
 9530        #         "transcripts_column": "Ensembl_transcriptid",
 9531        #         "transcripts_infos_columns": [
 9532        #             "genename",
 9533        #             "VARITY_R_score",
 9534        #             "Aloft_pred",
 9535        #         ],
 9536        #     },
 9537        # ],
 9538
 9539        # Init
 9540        if temporary_tables is None:
 9541            temporary_tables = []
 9542        if annotation_fields is None:
 9543            annotation_fields = []
 9544
 9545        # Variants table
 9546        table_variants = self.get_table_variants()
 9547
 9548        for columns_map in columns_maps:
 9549
 9550            # Transcript column
 9551            transcripts_column = columns_map.get("transcripts_column", None)
 9552
 9553            # Transcripts infos columns
 9554            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
 9555
 9556            if transcripts_column is not None:
 9557
 9558                # Explode
 9559                added_columns += self.explode_infos(
 9560                    fields=[transcripts_column] + transcripts_infos_columns
 9561                )
 9562
 9563                # View clauses
 9564                clause_select = []
 9565                for field in [transcripts_column] + transcripts_infos_columns:
 9566                    clause_select.append(
 9567                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
 9568                    )
 9569                    if field not in [transcripts_column]:
 9570                        annotation_fields.append(field)
 9571
 9572                # Querey View
 9573                query = f""" 
 9574                    SELECT
 9575                        "#CHROM", POS, REF, ALT,
 9576                        "{transcripts_column}" AS 'transcript',
 9577                        {", ".join(clause_select)}
 9578                    FROM (
 9579                        SELECT 
 9580                            "#CHROM", POS, REF, ALT,
 9581                            {", ".join(clause_select)}
 9582                        FROM {table_variants}
 9583                        )
 9584                    WHERE "{transcripts_column}" IS NOT NULL
 9585                """
 9586
 9587                # Create temporary table
 9588                temporary_table = transcripts_table + "".join(
 9589                    random.choices(string.ascii_uppercase + string.digits, k=10)
 9590                )
 9591
 9592                # Temporary_tables
 9593                temporary_tables.append(temporary_table)
 9594                query_view = f"""
 9595                    CREATE TEMPORARY TABLE {temporary_table}
 9596                    AS ({query})
 9597                """
 9598                self.execute_query(query=query_view)
 9599
 9600        return added_columns, temporary_tables, annotation_fields
 9601
 9602    def create_transcript_view_from_column_format(
 9603        self,
 9604        transcripts_table: str = "transcripts",
 9605        column_formats: dict = {},
 9606        temporary_tables: list = None,
 9607        annotation_fields: list = None,
 9608    ) -> tuple[list, list, list]:
 9609        """
 9610        The `create_transcript_view_from_column_format` function generates a transcript view based on
 9611        specified column formats, adds additional columns and annotation fields, and returns the list of
 9612        temporary tables and annotation fields.
 9613
 9614        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9615        the table containing the transcripts data. This table will be used as the base table for creating
 9616        the transcript view. The default value for this parameter is "transcripts", but you can provide a
 9617        different table name if needed, defaults to transcripts
 9618        :type transcripts_table: str (optional)
 9619        :param column_formats: The `column_formats` parameter is a dictionary that contains information
 9620        about the columns to be used for creating the transcript view. Each entry in the dictionary
 9621        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
 9622        the provided code snippet:
 9623        :type column_formats: dict
 9624        :param temporary_tables: The `temporary_tables` parameter in the
 9625        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
 9626        views created during the process of creating a transcript view from a column format. These temporary
 9627        views are used to manipulate and extract data before generating the final transcript view. It
 9628        :type temporary_tables: list
 9629        :param annotation_fields: The `annotation_fields` parameter in the
 9630        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
 9631        that are extracted from the temporary views created during the process. These annotation fields are
 9632        obtained by querying the temporary views and extracting the column names excluding specific columns
 9633        like `#CH
 9634        :type annotation_fields: list
 9635        :return: The `create_transcript_view_from_column_format` function returns two lists:
 9636        `temporary_tables` and `annotation_fields`.
 9637        """
 9638
 9639        log.debug("Start transcrpts view creation from column format...")
 9640
 9641        #  "from_column_format": [
 9642        #     {
 9643        #         "transcripts_column": "ANN",
 9644        #         "transcripts_infos_column": "Feature_ID",
 9645        #     }
 9646        # ],
 9647
 9648        # Init
 9649        if temporary_tables is None:
 9650            temporary_tables = []
 9651        if annotation_fields is None:
 9652            annotation_fields = []
 9653
 9654        for column_format in column_formats:
 9655
 9656            # annotation field and transcript annotation field
 9657            annotation_field = column_format.get("transcripts_column", "ANN")
 9658            transcript_annotation = column_format.get(
 9659                "transcripts_infos_column", "Feature_ID"
 9660            )
 9661
 9662            # Temporary View name
 9663            temporary_view_name = transcripts_table + "".join(
 9664                random.choices(string.ascii_uppercase + string.digits, k=10)
 9665            )
 9666
 9667            # Create temporary view name
 9668            temporary_view_name = self.annotation_format_to_table(
 9669                uniquify=True,
 9670                annotation_field=annotation_field,
 9671                view_name=temporary_view_name,
 9672                annotation_id=transcript_annotation,
 9673            )
 9674
 9675            # Annotation fields
 9676            if temporary_view_name:
 9677                query_annotation_fields = f"""
 9678                    SELECT *
 9679                    FROM (
 9680                        DESCRIBE SELECT *
 9681                        FROM {temporary_view_name}
 9682                        )
 9683                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
 9684                """
 9685                df_annotation_fields = self.get_query_to_df(
 9686                    query=query_annotation_fields
 9687                )
 9688
 9689                # Add temporary view and annotation fields
 9690                temporary_tables.append(temporary_view_name)
 9691                annotation_fields += list(set(df_annotation_fields["column_name"]))
 9692
 9693        return temporary_tables, annotation_fields
 9694
 9695    def create_transcript_view(
 9696        self,
 9697        transcripts_table: str = None,
 9698        transcripts_table_drop: bool = True,
 9699        param: dict = {},
 9700    ) -> str:
 9701        """
 9702        The `create_transcript_view` function generates a transcript view by processing data from a
 9703        specified table based on provided parameters and structural information.
 9704
 9705        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
 9706        is used to specify the name of the table that will store the final transcript view data. If a table
 9707        name is not provided, the function will create a new table to store the transcript view data, and by
 9708        default,, defaults to transcripts
 9709        :type transcripts_table: str (optional)
 9710        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
 9711        `create_transcript_view` function is a boolean parameter that determines whether to drop the
 9712        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
 9713        the function will drop the existing transcripts table if it exists, defaults to True
 9714        :type transcripts_table_drop: bool (optional)
 9715        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
 9716        contains information needed to create a transcript view. It includes details such as the structure
 9717        of the transcripts, columns mapping, column formats, and other necessary information for generating
 9718        the view. This parameter allows for flexibility and customization
 9719        :type param: dict
 9720        :return: The `create_transcript_view` function returns the name of the transcripts table that was
 9721        created or modified during the execution of the function.
 9722        """
 9723
 9724        log.debug("Start transcripts view creation...")
 9725
 9726        # Default
 9727        transcripts_table_default = "transcripts"
 9728
 9729        # Param
 9730        if not param:
 9731            param = self.get_param()
 9732
 9733        # Struct
 9734        struct = param.get("transcripts", {}).get("struct", None)
 9735
 9736        if struct:
 9737
 9738            # Transcripts table
 9739            if transcripts_table is None:
 9740                transcripts_table = param.get("transcripts", {}).get(
 9741                    "table", transcripts_table_default
 9742                )
 9743
 9744            # added_columns
 9745            added_columns = []
 9746
 9747            # Temporary tables
 9748            temporary_tables = []
 9749
 9750            # Annotation fields
 9751            annotation_fields = []
 9752
 9753            # from columns map
 9754            columns_maps = struct.get("from_columns_map", [])
 9755            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
 9756                self.create_transcript_view_from_columns_map(
 9757                    transcripts_table=transcripts_table,
 9758                    columns_maps=columns_maps,
 9759                    added_columns=added_columns,
 9760                    temporary_tables=temporary_tables,
 9761                    annotation_fields=annotation_fields,
 9762                )
 9763            )
 9764            added_columns += added_columns_tmp
 9765            temporary_tables += temporary_tables_tmp
 9766            annotation_fields += annotation_fields_tmp
 9767
 9768            # from column format
 9769            column_formats = struct.get("from_column_format", [])
 9770            temporary_tables_tmp, annotation_fields_tmp = (
 9771                self.create_transcript_view_from_column_format(
 9772                    transcripts_table=transcripts_table,
 9773                    column_formats=column_formats,
 9774                    temporary_tables=temporary_tables,
 9775                    annotation_fields=annotation_fields,
 9776                )
 9777            )
 9778            temporary_tables += temporary_tables_tmp
 9779            annotation_fields += annotation_fields_tmp
 9780
 9781            # Merge temporary tables query
 9782            query_merge = ""
 9783            for temporary_table in temporary_tables:
 9784
 9785                # First temporary table
 9786                if not query_merge:
 9787                    query_merge = f"""
 9788                        SELECT * FROM {temporary_table}
 9789                    """
 9790                # other temporary table (using UNION)
 9791                else:
 9792                    query_merge += f"""
 9793                        UNION BY NAME SELECT * FROM {temporary_table}
 9794                    """
 9795
 9796            # Merge on transcript
 9797            query_merge_on_transcripts_annotation_fields = []
 9798            # Aggregate all annotations fields
 9799            for annotation_field in set(annotation_fields):
 9800                query_merge_on_transcripts_annotation_fields.append(
 9801                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
 9802                )
 9803            # Query for transcripts view
 9804            query_merge_on_transcripts = f"""
 9805                SELECT "#CHROM", POS, REF, ALT, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
 9806                FROM ({query_merge})
 9807                GROUP BY "#CHROM", POS, REF, ALT, transcript
 9808            """
 9809
 9810            # Drop transcript view is necessary
 9811            if transcripts_table_drop:
 9812                query_drop = f"""
 9813                    DROP TABLE IF EXISTS {transcripts_table};
 9814                """
 9815                self.execute_query(query=query_drop)
 9816
 9817            # Merge and create transcript view
 9818            query_create_view = f"""
 9819                CREATE TABLE IF NOT EXISTS {transcripts_table}
 9820                AS {query_merge_on_transcripts}
 9821            """
 9822            self.execute_query(query=query_create_view)
 9823
 9824            # Remove added columns
 9825            for added_column in added_columns:
 9826                self.drop_column(column=added_column)
 9827
 9828        else:
 9829
 9830            transcripts_table = None
 9831
 9832        return transcripts_table
 9833
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        Explode a snpEff/VEP-style annotation INFO field into a temporary table.

        The annotation sub-field names are parsed from the quoted section of the
        INFO header description (e.g. "Functional annotations: 'Allele | ...'"),
        each variant's annotation string is converted to JSON per transcript,
        and one typed column per annotation sub-field is materialized in a
        temporary DuckDB table named ``view_name``. The column named by
        ``annotation_id`` (alphanumeric characters only) is duplicated as a
        ``transcript`` column.

        :param uniquify: If True, duplicate annotation entries are collapsed by
            ``explode_annotation_format``, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: Name of the INFO field holding the annotations
            (must exist in the VCF header), defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: Annotation sub-field used as the transcript
            identifier; non-alphanumeric characters are stripped before use,
            defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table to create, defaults to
            transcripts
        :type view_name: str (optional)
        :return: The name of the created temporary table (``view_name``), or
            None when ``annotation_field`` is absent from the VCF header.
        """

        # Name of the intermediate JSON column added to the dataframe below
        annotation_format = "annotation_explode"

        # Sanitize the transcript id so it is a safe SQL identifier
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Any truthy explode prefix is normalized to the fixed "INFO/" prefix
        # (the configured value itself is not used)
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the annotation field and its JSON form
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header object (provides INFO field descriptions)
        vcf_reader = self.get_header()

        # Columns added to the variants table, dropped again before returning
        added_columns = []

        # Explode the annotation INFO field into its own column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the annotation sub-field names from the single-quoted
            # part of the header description, split on " | "
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sanitized sub-field name -> original sub-field name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Add a synthetic variant id column to the variants table
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Fetch variants and their raw annotation strings as a DataFrame.
            # NOTE: the local variable name matters — the SQL below references
            # "dataframe_annotation_format" directly (DuckDB replacement scan).
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Convert each annotation string into a JSON document keyed by
            # transcript index, with the header sub-field names as keys
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Collect the distinct JSON keys present in the first entry ('$.0')
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # For each key, sample its values to infer a SQL column type
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Original JSON key
                key = row.iloc[0]

                # Sanitized key used as the SQL column name
                key_clean = "".join(char for char in key if char.isalnum())

                # Extract all values of this key across transcripts.
                # NOTE(review): trim('{key}') trims the key *literal*, so the
                # WHERE predicate is constant per key — presumably intended to
                # filter empty values instead; confirm against callers.
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Normalize missing values (None/"" -> NaN) and drop them so
                # type detection only sees real values
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Infer the SQL type from the remaining values
                column_type = detect_column_type(df_json_type[key_clean])

                # One typed, prefixed column per annotation sub-field
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
                )

            # Materialize the exploded annotations as a temporary table, with
            # the annotation_id column duplicated as 'transcript'
            query_view = f"""CREATE TEMPORARY TABLE {view_name} AS (SELECT *, {annotation_id} AS 'transcript' FROM (SELECT "#CHROM", POS, REF, ALT, {",".join(query_json_key)} FROM dataframe_annotation_format));"""
            self.execute_query(query=query_view)

        else:

            # Annotation field not declared in the header: nothing to explode
            view_name = None

        # Drop the helper columns added to the variants table
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
 9987    def transcript_view_to_variants(
 9988        self,
 9989        transcripts_table: str = None,
 9990        transcripts_column_id: str = None,
 9991        transcripts_info_json: str = None,
 9992        transcripts_info_field: str = None,
 9993        param: dict = {},
 9994    ) -> bool:
 9995        """
 9996        The function `transcript_view_to_variants` takes input parameters related to transcripts and updates
 9997        a variants table with information from the transcripts in JSON format.
 9998
 9999        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the table
10000        containing the transcripts data. If this parameter is not provided, the function will attempt to
10001        retrieve it from the `param` dictionary or use a default value of "transcripts"
10002        :type transcripts_table: str
10003        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the column in
10004        the `transcripts_table` that contains the unique identifier for each transcript. This identifier is
10005        used to match transcripts with variants in the database
10006        :type transcripts_column_id: str
10007        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name of
10008        the column in the variants table where the transcripts information will be stored in JSON format
10009        :type transcripts_info_json: str
10010        :param transcripts_info_field: The `transcripts_info_field` parameter is used to specify the field
10011        in the VCF header that will contain information about transcripts in JSON format. This field will be
10012        added to the VCF header as an INFO field with the specified name
10013        :type transcripts_info_field: str
10014        :param param: The `transcript_view_to_variants` method takes several parameters:
10015        :type param: dict
10016        :return: The function `transcript_view_to_variants` returns a boolean value, which is `True` if the
10017        operation is successful and `False` if certain conditions are not met.
10018        """
10019
10020        log.debug("Start transcripts view to JSON...")
10021
10022        # Default
10023        transcripts_table_default = "transcripts"
10024        transcripts_column_id_default = "transcript"
10025        transcripts_info_json_default = None
10026        transcripts_info_field_default = None
10027
10028        # Param
10029        if not param:
10030            param = self.get_param()
10031
10032        # Transcripts table
10033        if transcripts_table is None:
10034            transcripts_table = param.get("transcripts", {}).get(
10035                "table", transcripts_table_default
10036            )
10037
10038        # Transcripts column ID
10039        if transcripts_column_id is None:
10040            transcripts_column_id = param.get("transcripts", {}).get(
10041                "column_id", transcripts_column_id_default
10042            )
10043
10044        # Transcripts info field
10045        if transcripts_info_json is None:
10046            transcripts_info_json = param.get("transcripts", {}).get(
10047                "transcripts_info_json", transcripts_info_json_default
10048            )
10049
10050        # Transcripts info field
10051        if transcripts_info_field is None:
10052            transcripts_info_field = param.get("transcripts", {}).get(
10053                "transcripts_info_field", transcripts_info_field_default
10054            )
10055
10056        # Variants table
10057        table_variants = self.get_table_variants()
10058
10059        # Check info columns param
10060        if transcripts_info_json is None and transcripts_info_field is None:
10061            return False
10062
10063        # Transcripts infos columns
10064        query_transcripts_infos_columns = f"""
10065            SELECT *
10066            FROM (
10067                DESCRIBE SELECT * FROM {transcripts_table}
10068                )
10069            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
10070        """
10071        transcripts_infos_columns = list(
10072            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
10073        )
10074
10075        # View results
10076        clause_select = []
10077        clause_to_json = []
10078        for field in transcripts_infos_columns:
10079            clause_select.append(
10080                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10081            )
10082            clause_to_json.append(f""" '{field}': "{field}" """)
10083
10084        # Update
10085        update_set = []
10086
10087        # VCF header
10088        vcf_reader = self.get_header()
10089
10090        # Transcripts to info column in JSON
10091        if transcripts_info_json is not None:
10092
10093            # Create column on variants table
10094            self.add_column(
10095                table_name=table_variants,
10096                column_name=transcripts_info_json,
10097                column_type="JSON",
10098                default_value=None,
10099                drop=False,
10100            )
10101
10102            # Add to update
10103            update_set.append(
10104                f""" {transcripts_info_json}=t.{transcripts_info_json} """
10105            )
10106
10107            # Add header
10108            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
10109                transcripts_info_json,
10110                ".",
10111                "String",
10112                "Transcripts in JSON format",
10113                "unknwon",
10114                "unknwon",
10115                self.code_type_map["String"],
10116            )
10117
10118        # Transcripts to info field in JSON
10119        if transcripts_info_field is not None:
10120
10121            # Add to update
10122            update_set.append(
10123                f""" 
10124                    INFO = concat(
10125                            CASE
10126                                WHEN INFO NOT IN ('', '.')
10127                                THEN INFO
10128                                ELSE ''
10129                            END,
10130                            CASE
10131                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
10132                                THEN concat(
10133                                    ';{transcripts_info_field}=',
10134                                    t.{transcripts_info_json}
10135                                )
10136                                ELSE ''
10137                            END
10138                            )
10139                """
10140            )
10141
10142            # Add header
10143            vcf_reader.infos[transcripts_info_field] = vcf.parser._Info(
10144                transcripts_info_field,
10145                ".",
10146                "String",
10147                "Transcripts in JSON format",
10148                "unknwon",
10149                "unknwon",
10150                self.code_type_map["String"],
10151            )
10152
10153        # Update query
10154        query_update = f"""
10155            UPDATE {table_variants}
10156                SET {", ".join(update_set)}
10157            FROM
10158            (
10159                SELECT
10160                    "#CHROM", POS, REF, ALT,
10161                        concat(
10162                        '{{',
10163                        string_agg(
10164                            '"' || "{transcripts_column_id}" || '":' ||
10165                            to_json(json_output)
10166                        ),
10167                        '}}'
10168                        )::JSON AS {transcripts_info_json}
10169                FROM
10170                    (
10171                    SELECT
10172                        "#CHROM", POS, REF, ALT,
10173                        "{transcripts_column_id}",
10174                        to_json(
10175                            {{{",".join(clause_to_json)}}}
10176                        )::JSON AS json_output
10177                    FROM
10178                        (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10179                    WHERE "{transcripts_column_id}" IS NOT NULL
10180                    )
10181                GROUP BY "#CHROM", POS, REF, ALT
10182            ) AS t
10183            WHERE {table_variants}."#CHROM" = t."#CHROM"
10184                AND {table_variants}."POS" = t."POS"
10185                AND {table_variants}."REF" = t."REF"
10186                AND {table_variants}."ALT" = t."ALT"
10187        """
10188
10189        self.execute_query(query=query_update)
10190
10191        return True
Variants( conn=None, input: str = None, output: str = None, config: dict = {}, param: dict = {}, load: bool = False)
36    def __init__(
37        self,
38        conn=None,
39        input: str = None,
40        output: str = None,
41        config: dict = {},
42        param: dict = {},
43        load: bool = False,
44    ) -> None:
45        """
46        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
47        header
48
49        :param conn: the connection to the database
50        :param input: the input file
51        :param output: the output file
52        :param config: a dictionary containing the configuration of the model
53        :param param: a dictionary containing the parameters of the model
54        """
55
56        # Init variables
57        self.init_variables()
58
59        # Input
60        self.set_input(input)
61
62        # Config
63        self.set_config(config)
64
65        # Param
66        self.set_param(param)
67
68        # Output
69        self.set_output(output)
70
71        # connexion
72        self.set_connexion(conn)
73
74        # Header
75        self.set_header()
76
77        # Load data
78        if load:
79            self.load_data()

The function __init__ initializes the variables, sets the input, output, config, param, connexion and header

Parameters
  • conn: the connection to the database
  • input: the input file
  • output: the output file
  • config: a dictionary containing the configuration of the model
  • param: a dictionary containing the parameters of the model
def set_input(self, input: str = None) -> None:
 81    def set_input(self, input: str = None) -> None:
 82        """
 83        The function `set_input` takes a file name as input, extracts the name and extension, and sets
 84        attributes in the class accordingly.
 85
 86        :param input: The `set_input` method in the provided code snippet is used to set attributes
 87        related to the input file. Here's a breakdown of the parameters and their usage in the method:
 88        :type input: str
 89        """
 90
 91        if input and not isinstance(input, str):
 92            try:
 93                self.input = input.name
 94            except:
 95                log.error(f"Input file '{input} in bad format")
 96                raise ValueError(f"Input file '{input} in bad format")
 97        else:
 98            self.input = input
 99
100        # Input format
101        if input:
102            input_name, input_extension = os.path.splitext(self.input)
103            self.input_name = input_name
104            self.input_extension = input_extension
105            self.input_format = self.input_extension.replace(".", "")

The function set_input takes a file name as input, extracts the name and extension, and sets attributes in the class accordingly.

Parameters
  • input: path to the input file, or a file-like object whose name attribute provides the path; used to derive the input name, extension, and format attributes.
def set_config(self, config: dict) -> None:
107    def set_config(self, config: dict) -> None:
108        """
109        The set_config function takes a config object and assigns it as the configuration object for the
110        class.
111
112        :param config: The `config` parameter in the `set_config` function is a dictionary object that
113        contains configuration settings for the class. When you call the `set_config` function with a
114        dictionary object as the argument, it will set that dictionary as the configuration object for
115        the class
116        :type config: dict
117        """
118
119        self.config = config

The set_config function takes a config object and assigns it as the configuration object for the class.

Parameters
  • config: The config parameter in the set_config function is a dictionary object that contains configuration settings for the class. When you call the set_config function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class
def set_param(self, param: dict) -> None:
121    def set_param(self, param: dict) -> None:
122        """
123        This function sets a parameter object for the class based on the input dictionary.
124
125        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
126        as the `param` attribute of the class instance
127        :type param: dict
128        """
129
130        self.param = param

This function sets a parameter object for the class based on the input dictionary.

Parameters
  • param: The set_param method you provided takes a dictionary object as input and sets it as the param attribute of the class instance
def init_variables(self) -> None:
132    def init_variables(self) -> None:
133        """
134        This function initializes the variables that will be used in the rest of the class
135        """
136
137        self.prefix = "howard"
138        self.table_variants = "variants"
139        self.dataframe = None
140
141        self.comparison_map = {
142            "gt": ">",
143            "gte": ">=",
144            "lt": "<",
145            "lte": "<=",
146            "equals": "=",
147            "contains": "SIMILAR TO",
148        }
149
150        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
151
152        self.code_type_map_to_sql = {
153            "Integer": "INTEGER",
154            "String": "VARCHAR",
155            "Float": "FLOAT",
156            "Flag": "VARCHAR",
157        }
158
159        self.index_additionnal_fields = []

This function initializes the variables that will be used in the rest of the class

def get_indexing(self) -> bool:
161    def get_indexing(self) -> bool:
162        """
163        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
164        returns False.
165        :return: The value of the indexing parameter.
166        """
167
168        return self.get_param().get("indexing", False)

It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.

Returns

The value of the indexing parameter.

def get_connexion_config(self) -> dict:
170    def get_connexion_config(self) -> dict:
171        """
172        The function `get_connexion_config` returns a dictionary containing the configuration for a
173        connection, including the number of threads and memory limit.
174        :return: a dictionary containing the configuration for the Connexion library.
175        """
176
177        # config
178        config = self.get_config()
179
180        # Connexion config
181        connexion_config = {}
182        threads = self.get_threads()
183
184        # Threads
185        if threads:
186            connexion_config["threads"] = threads
187
188        # Memory
189        # if config.get("memory", None):
190        #     connexion_config["memory_limit"] = config.get("memory")
191        if self.get_memory():
192            connexion_config["memory_limit"] = self.get_memory()
193
194        # Temporary directory
195        if config.get("tmp", None):
196            connexion_config["temp_directory"] = config.get("tmp")
197
198        # Access
199        if config.get("access", None):
200            access = config.get("access")
201            if access in ["RO"]:
202                access = "READ_ONLY"
203            elif access in ["RW"]:
204                access = "READ_WRITE"
205            connexion_db = self.get_connexion_db()
206            if connexion_db in ":memory:":
207                access = "READ_WRITE"
208            connexion_config["access_mode"] = access
209
210        return connexion_config

The function get_connexion_config returns a dictionary containing the configuration for a connection, including the number of threads and memory limit.

Returns

a dictionary containing the configuration for the database connection (threads, memory limit, temporary directory, access mode).

def get_duckdb_settings(self) -> dict:
212    def get_duckdb_settings(self) -> dict:
213        """
214        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
215        string.
216        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
217        """
218
219        # config
220        config = self.get_config()
221
222        # duckdb settings
223        duckdb_settings_dict = {}
224        if config.get("duckdb_settings", None):
225            duckdb_settings = config.get("duckdb_settings")
226            duckdb_settings = full_path(duckdb_settings)
227            # duckdb setting is a file
228            if os.path.exists(duckdb_settings):
229                with open(duckdb_settings) as json_file:
230                    duckdb_settings_dict = yaml.safe_load(json_file)
231            # duckdb settings is a string
232            else:
233                duckdb_settings_dict = json.loads(duckdb_settings)
234
235        return duckdb_settings_dict

The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a string.

Returns

The function get_duckdb_settings returns a dictionary object duckdb_settings_dict.

def set_connexion_db(self) -> str:
237    def set_connexion_db(self) -> str:
238        """
239        The function `set_connexion_db` returns the appropriate database connection string based on the
240        input format and connection type.
241        :return: the value of the variable `connexion_db`.
242        """
243
244        # Default connexion db
245        default_connexion_db = ":memory:"
246
247        # Find connexion db
248        if self.get_input_format() in ["db", "duckdb"]:
249            connexion_db = self.get_input()
250        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
251            connexion_db = default_connexion_db
252        elif self.get_connexion_type() in ["tmpfile"]:
253            tmp_name = tempfile.mkdtemp(
254                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
255            )
256            connexion_db = f"{tmp_name}/tmp.db"
257        elif self.get_connexion_type() != "":
258            connexion_db = self.get_connexion_type()
259        else:
260            connexion_db = default_connexion_db
261
262        # Set connexion db
263        self.connexion_db = connexion_db
264
265        return connexion_db

The function set_connexion_db returns the appropriate database connection string based on the input format and connection type.

Returns

the value of the variable connexion_db.

def set_connexion(self, conn) -> None:
267    def set_connexion(self, conn) -> None:
268        """
269        The function `set_connexion` creates a connection to a database, with options for different
270        database formats and settings.
271
272        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
273        database. If a connection is not provided, a new connection to an in-memory database is created.
274        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
275        sqlite
276        """
277
278        # Connexion db
279        connexion_db = self.set_connexion_db()
280
281        # Connexion config
282        connexion_config = self.get_connexion_config()
283
284        # Connexion format
285        connexion_format = self.get_config().get("connexion_format", "duckdb")
286        # Set connexion format
287        self.connexion_format = connexion_format
288
289        # Connexion
290        if not conn:
291            if connexion_format in ["duckdb"]:
292                conn = duckdb.connect(connexion_db, config=connexion_config)
293                # duckDB settings
294                duckdb_settings = self.get_duckdb_settings()
295                if duckdb_settings:
296                    for setting in duckdb_settings:
297                        setting_value = duckdb_settings.get(setting)
298                        if isinstance(setting_value, str):
299                            setting_value = f"'{setting_value}'"
300                        conn.execute(f"PRAGMA {setting}={setting_value};")
301            elif connexion_format in ["sqlite"]:
302                conn = sqlite3.connect(connexion_db)
303
304        # Set connexion
305        self.conn = conn
306
307        # Log
308        log.debug(f"connexion_format: {connexion_format}")
309        log.debug(f"connexion_db: {connexion_db}")
310        log.debug(f"connexion config: {connexion_config}")
311        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

The function set_connexion creates a connection to a database, with options for different database formats and settings.

Parameters
  • conn: The conn parameter in the set_connexion method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
def set_output(self, output: str = None) -> None:
313    def set_output(self, output: str = None) -> None:
314        """
315        The `set_output` function in Python sets the output file based on the input or a specified key
316        in the config file, extracting the output name, extension, and format.
317
318        :param output: The `output` parameter in the `set_output` method is used to specify the name of
319        the output file. If the config file has an 'output' key, the method sets the output to the value
320        of that key. If no output is provided, it sets the output to `None`
321        :type output: str
322        """
323
324        if output and not isinstance(output, str):
325            self.output = output.name
326        else:
327            self.output = output
328
329        # Output format
330        if self.output:
331            output_name, output_extension = os.path.splitext(self.output)
332            self.output_name = output_name
333            self.output_extension = output_extension
334            self.output_format = self.output_extension.replace(".", "")
335        else:
336            self.output_name = None
337            self.output_extension = None
338            self.output_format = None

The set_output function in Python sets the output file based on the input or a specified key in the config file, extracting the output name, extension, and format.

Parameters
  • output: The output parameter in the set_output method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to None
def set_header(self) -> None:
340    def set_header(self) -> None:
341        """
342        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
343        """
344
345        input_file = self.get_input()
346        default_header_list = [
347            "##fileformat=VCFv4.2",
348            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
349        ]
350
351        # Full path
352        input_file = full_path(input_file)
353
354        if input_file:
355
356            input_format = self.get_input_format()
357            input_compressed = self.get_input_compressed()
358            config = self.get_config()
359            header_list = default_header_list
360            if input_format in [
361                "vcf",
362                "hdr",
363                "tsv",
364                "csv",
365                "psv",
366                "parquet",
367                "db",
368                "duckdb",
369            ]:
370                # header provided in param
371                if config.get("header_file", None):
372                    with open(config.get("header_file"), "rt") as f:
373                        header_list = self.read_vcf_header(f)
374                # within a vcf file format (header within input file itsself)
375                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
376                    # within a compressed vcf file format (.vcf.gz)
377                    if input_compressed:
378                        with bgzf.open(input_file, "rt") as f:
379                            header_list = self.read_vcf_header(f)
380                    # within an uncompressed vcf file format (.vcf)
381                    else:
382                        with open(input_file, "rt") as f:
383                            header_list = self.read_vcf_header(f)
384                # header provided in default external file .hdr
385                elif os.path.exists((input_file + ".hdr")):
386                    with open(input_file + ".hdr", "rt") as f:
387                        header_list = self.read_vcf_header(f)
388                else:
389                    try:  # Try to get header info fields and file columns
390
391                        with tempfile.TemporaryDirectory() as tmpdir:
392
393                            # Create database
394                            db_for_header = Database(database=input_file)
395
396                            # Get header columns for infos fields
397                            db_header_from_columns = (
398                                db_for_header.get_header_from_columns()
399                            )
400
401                            # Get real columns in the file
402                            db_header_columns = db_for_header.get_columns()
403
404                            # Write header file
405                            header_file_tmp = os.path.join(tmpdir, "header")
406                            f = open(header_file_tmp, "w")
407                            vcf.Writer(f, db_header_from_columns)
408                            f.close()
409
410                            # Replace #CHROM line with rel columns
411                            header_list = db_for_header.read_header_file(
412                                header_file=header_file_tmp
413                            )
414                            header_list[-1] = "\t".join(db_header_columns)
415
416                    except:
417
418                        log.warning(
419                            f"No header for file {input_file}. Set as default VCF header"
420                        )
421                        header_list = default_header_list
422
423            else:  # try for unknown format ?
424
425                log.error(f"Input file format '{input_format}' not available")
426                raise ValueError(f"Input file format '{input_format}' not available")
427
428            if not header_list:
429                header_list = default_header_list
430
431            # header as list
432            self.header_list = header_list
433
434            # header as VCF object
435            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
436
437        else:
438
439            self.header_list = None
440            self.header_vcf = None

It reads the header of a VCF file and stores it as a list of strings and as a VCF object

def get_query_to_df(self, query: str = '', limit: int = None) -> pandas.core.frame.DataFrame:
442    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
443        """
444        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
445        DataFrame based on the connection format.
446
447        :param query: The `query` parameter in the `get_query_to_df` function is a string that
448        represents the SQL query you want to execute. This query will be used to fetch data from a
449        database and convert it into a pandas DataFrame
450        :type query: str
451        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
452        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
453        function will only fetch up to that number of rows from the database query result. If no limit
454        is specified,
455        :type limit: int
456        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
457        """
458
459        # Connexion format
460        connexion_format = self.get_connexion_format()
461
462        # Limit in query
463        if limit:
464            pd.set_option("display.max_rows", limit)
465            if connexion_format in ["duckdb"]:
466                df = (
467                    self.conn.execute(query)
468                    .fetch_record_batch(limit)
469                    .read_next_batch()
470                    .to_pandas()
471                )
472            elif connexion_format in ["sqlite"]:
473                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
474
475        # Full query
476        else:
477            if connexion_format in ["duckdb"]:
478                df = self.conn.execute(query).df()
479            elif connexion_format in ["sqlite"]:
480                df = pd.read_sql_query(query, self.conn)
481
482        return df

The get_query_to_df function takes a query as a string and returns the result as a pandas DataFrame based on the connection format.

Parameters
  • query: The query parameter in the get_query_to_df function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame
  • limit: The limit parameter in the get_query_to_df function is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result. If no limit is specified,
Returns

A pandas DataFrame is being returned by the get_query_to_df function.

def get_overview(self) -> None:
484    def get_overview(self) -> None:
485        """
486        The function prints the input, output, config, and dataframe of the current object
487        """
488        table_variants_from = self.get_table_variants(clause="from")
489        sql_columns = self.get_header_columns_as_sql()
490        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
491        df = self.get_query_to_df(sql_query_export)
492        log.info(
493            "Input:  "
494            + str(self.get_input())
495            + " ["
496            + str(str(self.get_input_format()))
497            + "]"
498        )
499        log.info(
500            "Output: "
501            + str(self.get_output())
502            + " ["
503            + str(str(self.get_output_format()))
504            + "]"
505        )
506        log.info("Config: ")
507        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
508            "\n"
509        ):
510            log.info("\t" + str(d))
511        log.info("Param: ")
512        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
513            "\n"
514        ):
515            log.info("\t" + str(d))
516        log.info("Sample list: " + str(self.get_header_sample_list()))
517        log.info("Dataframe: ")
518        for d in str(df).split("\n"):
519            log.info("\t" + str(d))
520
521        # garbage collector
522        del df
523        gc.collect()
524
525        return None

The function prints the input, output, config, and dataframe of the current object

def get_stats(self) -> dict:
    def get_stats(self) -> dict:
        """
        Calculate and return statistics of the current object.

        The returned dictionary contains the following top-level keys:
        - "Infos": input file, number of variants, samples, INFO and
          FORMAT fields
        - "Variants": counts by chromosome, counts by type
          (Total/SNV/MNV/InDel) and SNV substitutions
        - "Samples": genotype counts per sample (only when a GT FORMAT
          field and a FORMAT column are present)
        - "Header": description of INFO and FORMAT fields
        - "Quality": QUAL statistics (only when a QUAL column is present)

        NOTE(review): the sample and QUAL queries use functions such as
        REGEXP_EXTRACT, string_split and median — presumably a duckdb
        connexion is expected here; verify behavior on sqlite connexions.

        :return: a dictionary of statistics
        """

        # Log
        log.info(f"Stats Calculation...")

        # Table variants
        table_variants_from = self.get_table_variants()

        # Stats dict to fill and return
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO/FORMAT declarations (pyvcf header object)
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chromosome, sorted by chromosome name
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Fraction of variants per chromosome (0..1, not multiplied by 100)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Count genotypes per sample, only for VCF-like data with genotypes
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Extract the genotype (leading 0/1, 1|1, ... token) of each
                # sample column, keeping only rows where the sample value has
                # as many ':'-separated fields as the FORMAT column
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                # A sample counts as present only if it has genotyped rows
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # NOTE: `i` is shared across both sections, so row indexes continue
        # from INFO fields into FORMAT fields
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # Number: map pyvcf special `num` codes back to the VCF
                # letters (None -> '.', -1 -> 'A', -2 -> 'G', -3 -> 'R');
                # plain integers are kept as-is
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # Type, '.' when missing
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # Description, empty string when missing
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL statistics, skipping missing ('.') values
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel counts
        # NOTE(review): in the InDel branch, `len(REF) > 1 OR len(ALT) > 1
        # AND len(REF) != len(ALT)` parses as `len(REF) > 1 OR (len(ALT) > 1
        # AND ...)` because AND binds tighter than OR — confirm this is the
        # intended condition and not a missing pair of parentheses.

        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # SNV substitution spectrum (e.g. 'A>G'), most frequent first
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats

The get_stats function calculates and returns various statistics of the current object, including information about the input file, variants, samples, header fields, quality, and SNVs/InDels.

Returns

a dictionary containing various statistics of the current object. The dictionary has the following structure:

def stats_to_file(self, file: str = None) -> str:
749    def stats_to_file(self, file: str = None) -> str:
750        """
751        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
752        into a JSON object, and writes the JSON object to the specified file.
753
754        :param file: The `file` parameter is a string that represents the file path where the JSON data
755        will be written
756        :type file: str
757        :return: the name of the file that was written to.
758        """
759
760        # Get stats
761        stats = self.get_stats()
762
763        # Serializing json
764        json_object = json.dumps(stats, indent=4)
765
766        # Writing to sample.json
767        with open(file, "w") as outfile:
768            outfile.write(json_object)
769
770        return file

The function stats_to_file takes a file name as input, retrieves statistics, serializes them into a JSON object, and writes the JSON object to the specified file.

Parameters
  • file: The file parameter is a string that represents the file path where the JSON data will be written
Returns

the name of the file that was written to.

def print_stats(self, output_file: str = None, json_file: str = None) -> None:
772    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
773        """
774        The `print_stats` function generates a markdown file and prints the statistics contained in a
775        JSON file in a formatted manner.
776
777        :param output_file: The `output_file` parameter is a string that specifies the path and filename
778        of the output file where the stats will be printed in Markdown format. If no `output_file` is
779        provided, a temporary directory will be created and the stats will be saved in a file named
780        "stats.md" within that
781        :type output_file: str
782        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
783        file where the statistics will be saved. If no value is provided, a temporary directory will be
784        created and a default file name "stats.json" will be used
785        :type json_file: str
786        :return: The function `print_stats` does not return any value. It has a return type annotation
787        of `None`.
788        """
789
790        # Full path
791        output_file = full_path(output_file)
792        json_file = full_path(json_file)
793
794        with tempfile.TemporaryDirectory() as tmpdir:
795
796            # Files
797            if not output_file:
798                output_file = os.path.join(tmpdir, "stats.md")
799            if not json_file:
800                json_file = os.path.join(tmpdir, "stats.json")
801
802            # Create folders
803            if not os.path.exists(os.path.dirname(output_file)):
804                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
805            if not os.path.exists(os.path.dirname(json_file)):
806                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
807
808            # Create stats JSON file
809            stats_file = self.stats_to_file(file=json_file)
810
811            # Print stats file
812            with open(stats_file) as f:
813                stats = yaml.safe_load(f)
814
815            # Output
816            output_title = []
817            output_index = []
818            output = []
819
820            # Title
821            output_title.append("# HOWARD Stats")
822
823            # Index
824            output_index.append("## Index")
825
826            # Process sections
827            for section in stats:
828                infos = stats.get(section)
829                section_link = "#" + section.lower().replace(" ", "-")
830                output.append(f"## {section}")
831                output_index.append(f"- [{section}]({section_link})")
832
833                if len(infos):
834                    for info in infos:
835                        try:
836                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
837                            is_df = True
838                        except:
839                            try:
840                                df = pd.DataFrame.from_dict(
841                                    json.loads((infos.get(info))), orient="index"
842                                )
843                                is_df = True
844                            except:
845                                is_df = False
846                        if is_df:
847                            output.append(f"### {info}")
848                            info_link = "#" + info.lower().replace(" ", "-")
849                            output_index.append(f"   - [{info}]({info_link})")
850                            output.append(f"{df.to_markdown(index=False)}")
851                        else:
852                            output.append(f"- {info}: {infos.get(info)}")
853                else:
854                    output.append(f"NA")
855
856            # Write stats in markdown file
857            with open(output_file, "w") as fp:
858                for item in output_title:
859                    fp.write("%s\n" % item)
860                for item in output_index:
861                    fp.write("%s\n" % item)
862                for item in output:
863                    fp.write("%s\n" % item)
864
865            # Output stats in markdown
866            print("")
867            print("\n\n".join(output_title))
868            print("")
869            print("\n\n".join(output))
870            print("")
871
872        return None

The print_stats function generates a markdown file and prints the statistics contained in a JSON file in a formatted manner.

Parameters
  • output_file: The output_file parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no output_file is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that
  • json_file: The json_file parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used
Returns

The function print_stats does not return any value. It has a return type annotation of None.

def get_input(self) -> str:
874    def get_input(self) -> str:
875        """
876        It returns the value of the input variable.
877        :return: The input is being returned.
878        """
879        return self.input

It returns the value of the input variable.

Returns

The input is being returned.

def get_input_format(self, input_file: str = None) -> str:
881    def get_input_format(self, input_file: str = None) -> str:
882        """
883        This function returns the format of the input variable, either from the provided input file or
884        by prompting for input.
885
886        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
887        represents the file path of the input file. If no `input_file` is provided when calling the
888        method, it will default to `None`
889        :type input_file: str
890        :return: The format of the input variable is being returned.
891        """
892
893        if not input_file:
894            input_file = self.get_input()
895        input_format = get_file_format(input_file)
896        return input_format

This function returns the format of the input variable, either from the provided input file or by prompting for input.

Parameters
  • input_file: The input_file parameter in the get_input_format method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None
Returns

The format of the input variable is being returned.

def get_input_compressed(self, input_file: str = None) -> str:
898    def get_input_compressed(self, input_file: str = None) -> str:
899        """
900        The function `get_input_compressed` returns the format of the input variable after compressing
901        it.
902
903        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
904        that represents the file path of the input file. If no `input_file` is provided when calling the
905        method, it will default to `None` and the method will then call `self.get_input()` to
906        :type input_file: str
907        :return: The function `get_input_compressed` returns the compressed format of the input
908        variable.
909        """
910
911        if not input_file:
912            input_file = self.get_input()
913        input_compressed = get_file_compressed(input_file)
914        return input_compressed

The function get_input_compressed returns the format of the input variable after compressing it.

Parameters
  • input_file: The input_file parameter in the get_input_compressed method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None and the method will then call self.get_input() to
Returns

The function get_input_compressed returns the compressed format of the input variable.

def get_output(self) -> str:
916    def get_output(self) -> str:
917        """
918        It returns the output of the neuron.
919        :return: The output of the neural network.
920        """
921
922        return self.output

It returns the output file path of the object.

Returns

The output file currently set on the object.

def get_output_format(self, output_file: str = None) -> str:
924    def get_output_format(self, output_file: str = None) -> str:
925        """
926        The function `get_output_format` returns the format of the input variable or the output file if
927        provided.
928
929        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
930        that represents the file path of the output file. If no `output_file` is provided when calling
931        the method, it will default to the output obtained from the `get_output` method of the class
932        instance. The
933        :type output_file: str
934        :return: The format of the input variable is being returned.
935        """
936
937        if not output_file:
938            output_file = self.get_output()
939        output_format = get_file_format(output_file)
940
941        return output_format

The function get_output_format returns the format of the input variable or the output file if provided.

Parameters
  • output_file: The output_file parameter in the get_output_format method is a string that represents the file path of the output file. If no output_file is provided when calling the method, it will default to the output obtained from the get_output method of the class instance. The
Returns

The format of the input variable is being returned.

def get_config(self) -> dict:
943    def get_config(self) -> dict:
944        """
945        It returns the config
946        :return: The config variable is being returned.
947        """
948        return self.config

It returns the config

Returns

The config variable is being returned.

def get_param(self) -> dict:
950    def get_param(self) -> dict:
951        """
952        It returns the param
953        :return: The param variable is being returned.
954        """
955        return self.param

It returns the param

Returns

The param variable is being returned.

def get_connexion_db(self) -> str:
957    def get_connexion_db(self) -> str:
958        """
959        It returns the connexion_db attribute of the object
960        :return: The connexion_db is being returned.
961        """
962        return self.connexion_db

It returns the connexion_db attribute of the object

Returns

The connexion_db is being returned.

def get_prefix(self) -> str:
964    def get_prefix(self) -> str:
965        """
966        It returns the prefix of the object.
967        :return: The prefix is being returned.
968        """
969        return self.prefix

It returns the prefix of the object.

Returns

The prefix is being returned.

def get_table_variants(self, clause: str = 'select') -> str:
971    def get_table_variants(self, clause: str = "select") -> str:
972        """
973        This function returns the table_variants attribute of the object
974
975        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
976        defaults to select (optional)
977        :return: The table_variants attribute of the object.
978        """
979
980        # Access
981        access = self.get_config().get("access", None)
982
983        # Clauses "select", "where", "update"
984        if clause in ["select", "where", "update"]:
985            table_variants = self.table_variants
986        # Clause "from"
987        elif clause in ["from"]:
988            # For Read Only
989            if self.get_input_format() in ["parquet"] and access in ["RO"]:
990                input_file = self.get_input()
991                table_variants = f"'{input_file}' as variants"
992            # For Read Write
993            else:
994                table_variants = f"{self.table_variants} as variants"
995        else:
996            table_variants = self.table_variants
997        return table_variants

This function returns the table_variants attribute of the object

Parameters
  • clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns

The table_variants attribute of the object.

def get_tmp_dir(self) -> str:
 999    def get_tmp_dir(self) -> str:
1000        """
1001        The function `get_tmp_dir` returns the temporary directory path based on configuration
1002        parameters or a default path.
1003        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
1004        configuration, parameters, and a default value of "/tmp".
1005        """
1006
1007        return get_tmp(
1008            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
1009        )

The function get_tmp_dir returns the temporary directory path based on configuration parameters or a default path.

Returns

The get_tmp_dir method is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".

def get_connexion_type(self) -> str:
1011    def get_connexion_type(self) -> str:
1012        """
1013        If the connexion type is not in the list of allowed connexion types, raise a ValueError
1014
1015        :return: The connexion type is being returned.
1016        """
1017        return self.get_config().get("connexion_type", "memory")

Returns the connexion type from the configuration, defaulting to "memory" when unset.

Returns

The "connexion_type" config value (e.g. "memory").

def get_connexion(self):
1019    def get_connexion(self):
1020        """
1021        It returns the connection object
1022
1023        :return: The connection object.
1024        """
1025        return self.conn

It returns the connection object

Returns

The connection object.

def close_connexion(self) -> None:
1027    def close_connexion(self) -> None:
1028        """
1029        This function closes the connection to the database.
1030        :return: The connection is being closed.
1031        """
1032        return self.conn.close()

This function closes the connection to the database.

Returns

The connection is being closed.

def get_header(self, type: str = 'vcf'):
1034    def get_header(self, type: str = "vcf"):
1035        """
1036        This function returns the header of the VCF file as a list of strings
1037
1038        :param type: the type of header you want to get, defaults to vcf (optional)
1039        :return: The header of the vcf file.
1040        """
1041
1042        if self.header_vcf:
1043            if type == "vcf":
1044                return self.header_vcf
1045            elif type == "list":
1046                return self.header_list
1047        else:
1048            if type == "vcf":
1049                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
1050                return header
1051            elif type == "list":
1052                return vcf_required

This function returns the header of the VCF file as a list of strings

Parameters
  • type: the type of header you want to get, defaults to vcf (optional)
Returns

The header of the vcf file.

def get_header_length(self, file: str = None) -> int:
1054    def get_header_length(self, file: str = None) -> int:
1055        """
1056        The function `get_header_length` returns the length of the header list, excluding the #CHROM
1057        line.
1058
1059        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
1060        header file. If this argument is provided, the function will read the header from the specified
1061        file and return the length of the header list minus 1 (to exclude the #CHROM line)
1062        :type file: str
1063        :return: the length of the header list, excluding the #CHROM line.
1064        """
1065
1066        if file:
1067            return len(self.read_vcf_header_file(file=file)) - 1
1068        elif self.get_header(type="list"):
1069            return len(self.get_header(type="list")) - 1
1070        else:
1071            return 0

The function get_header_length returns the length of the header list, excluding the #CHROM line.

Parameters
  • file: The file parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns

the length of the header list, excluding the #CHROM line.

def get_header_columns(self) -> str:
1073    def get_header_columns(self) -> str:
1074        """
1075        This function returns the header list of a VCF
1076
1077        :return: The length of the header list.
1078        """
1079        if self.get_header():
1080            return self.get_header(type="list")[-1]
1081        else:
1082            return ""

This function returns the #CHROM columns line of the VCF header

Returns

The last header line (the "#CHROM ..." columns line), or an empty string when no header is available.

def get_header_columns_as_list(self) -> list:
1084    def get_header_columns_as_list(self) -> list:
1085        """
1086        This function returns the header list of a VCF
1087
1088        :return: The length of the header list.
1089        """
1090        if self.get_header():
1091            return self.get_header_columns().strip().split("\t")
1092        else:
1093            return []

This function returns the #CHROM columns line of the VCF header as a list

Returns

The header column names as a list, or an empty list when no header is available.

def get_header_columns_as_sql(self) -> str:
1095    def get_header_columns_as_sql(self) -> str:
1096        """
1097        This function retruns header length (without #CHROM line)
1098
1099        :return: The length of the header list.
1100        """
1101        sql_column_list = []
1102        for col in self.get_header_columns_as_list():
1103            sql_column_list.append(f'"{col}"')
1104        return ",".join(sql_column_list)

This function returns the VCF header column names quoted and joined for SQL

Returns

A comma-separated list of double-quoted column names.

def get_header_sample_list(self) -> list:
1106    def get_header_sample_list(self) -> list:
1107        """
1108        This function retruns header length (without #CHROM line)
1109
1110        :return: The length of the header list.
1111        """
1112        return self.header_vcf.samples

This function returns the list of sample names from the VCF header

Returns

The samples declared in the header.

def get_verbose(self) -> bool:
1114    def get_verbose(self) -> bool:
1115        """
1116        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1117        exist
1118
1119        :return: The value of the key "verbose" in the config dictionary.
1120        """
1121        return self.get_config().get("verbose", False)

It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist

Returns

The value of the key "verbose" in the config dictionary.

def get_connexion_format(self) -> str:
1123    def get_connexion_format(self) -> str:
1124        """
1125        It returns the connexion format of the object.
1126        :return: The connexion_format is being returned.
1127        """
1128        connexion_format = self.connexion_format
1129        if connexion_format not in ["duckdb", "sqlite"]:
1130            log.error(f"Unknown connexion format {connexion_format}")
1131            raise ValueError(f"Unknown connexion format {connexion_format}")
1132        else:
1133            return connexion_format

It returns the connexion format of the object.

Returns

The connexion_format is being returned.

def insert_file_to_table( self, file, columns: str, header_len: int = 0, sep: str = '\t', chunksize: int = 1000000) -> None:
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table of the current database connexion.

        :param file: path or file object of the file to load
        :param columns: comma-separated list of column names used in the
            INSERT statement
        :type columns: str
        :param header_len: number of leading lines to skip before the data
            (e.g. the VCF header), defaults to 0
        :type header_len: int (optional)
        :param sep: field separator used in the file (tab by default)
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk; overridden by the
            "load.chunk" config entry when present, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config: allow the chunk size to be overridden by configuration
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE: a falsy chunksize silently skips the load entirely
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # NOTE(review): this SQL references the local DataFrame
                    # "chunk" by name (DuckDB replacement scan of Python
                    # locals) — do not rename that variable
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # SQLite: pandas appends the chunk directly
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

The function reads a file in chunks and inserts each chunk into a table based on the specified database format.

Parameters
  • file: The file parameter is the file that you want to load into a table. It should be the path to the file on your system
  • columns: The columns parameter in the insert_file_to_table function is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name
  • header_len: The header_len parameter in the insert_file_to_table function specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0
  • sep: The sep parameter in the insert_file_to_table function is used to specify the separator character that is used in the file being read. In this case, the default separator is set to , which represents a tab character. You can change this parameter to a different separator character if, defaults to
  • chunksize: The chunksize parameter specifies the number of rows to read in at a time when processing the file in chunks. In the provided code snippet, the default value for chunksize is set to 1000000. This means that the file will be read in chunks of 1,, defaults to 1000000
def load_data( self, input_file: str = None, drop_variants_table: bool = False, sample_size: int = 20480) -> None:
1189    def load_data(
1190        self,
1191        input_file: str = None,
1192        drop_variants_table: bool = False,
1193        sample_size: int = 20480,
1194    ) -> None:
1195        """
1196        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
1197        table before loading the data and specify a sample size.
1198
1199        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
1200        table
1201        :type input_file: str
1202        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
1203        determines whether the variants table should be dropped before loading the data. If set to
1204        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
1205        not be dropped, defaults to False
1206        :type drop_variants_table: bool (optional)
1207        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
1208        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
1209        20480
1210        :type sample_size: int (optional)
1211        """
1212
1213        log.info("Loading...")
1214
1215        # change input file
1216        if input_file:
1217            self.set_input(input_file)
1218            self.set_header()
1219
1220        # drop variants table
1221        if drop_variants_table:
1222            self.drop_variants_table()
1223
1224        # get table variants
1225        table_variants = self.get_table_variants()
1226
1227        # Access
1228        access = self.get_config().get("access", None)
1229        log.debug(f"access: {access}")
1230
1231        # Input format and compress
1232        input_format = self.get_input_format()
1233        input_compressed = self.get_input_compressed()
1234        log.debug(f"input_format: {input_format}")
1235        log.debug(f"input_compressed: {input_compressed}")
1236
1237        # input_compressed_format
1238        if input_compressed:
1239            input_compressed_format = "gzip"
1240        else:
1241            input_compressed_format = "none"
1242        log.debug(f"input_compressed_format: {input_compressed_format}")
1243
1244        # Connexion format
1245        connexion_format = self.get_connexion_format()
1246
1247        # Sample size
1248        if not sample_size:
1249            sample_size = -1
1250        log.debug(f"sample_size: {sample_size}")
1251
1252        # Load data
1253        log.debug(f"Load Data from {input_format}")
1254
1255        # DuckDB connexion
1256        if connexion_format in ["duckdb"]:
1257
1258            # Database already exists
1259            if self.input_format in ["db", "duckdb"]:
1260
1261                if connexion_format in ["duckdb"]:
1262                    log.debug(f"Input file format '{self.input_format}' duckDB")
1263                else:
1264                    log.error(
1265                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1266                    )
1267                    raise ValueError(
1268                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1269                    )
1270
1271            # Load from existing database format
1272            else:
1273
1274                try:
1275                    # Create Table or View
1276                    database = Database(database=self.input)
1277                    sql_from = database.get_sql_from(sample_size=sample_size)
1278
1279                    if access in ["RO"]:
1280                        sql_load = (
1281                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
1282                        )
1283                    else:
1284                        sql_load = (
1285                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
1286                        )
1287                    self.conn.execute(sql_load)
1288
1289                except:
1290                    # Format not available
1291                    log.error(f"Input file format '{self.input_format}' not available")
1292                    raise ValueError(
1293                        f"Input file format '{self.input_format}' not available"
1294                    )
1295
1296        # SQLite connexion
1297        elif connexion_format in ["sqlite"] and input_format in [
1298            "vcf",
1299            "tsv",
1300            "csv",
1301            "psv",
1302        ]:
1303
1304            # Main structure
1305            structure = {
1306                "#CHROM": "VARCHAR",
1307                "POS": "INTEGER",
1308                "ID": "VARCHAR",
1309                "REF": "VARCHAR",
1310                "ALT": "VARCHAR",
1311                "QUAL": "VARCHAR",
1312                "FILTER": "VARCHAR",
1313                "INFO": "VARCHAR",
1314            }
1315
1316            # Strcuture with samples
1317            structure_complete = structure
1318            if self.get_header_sample_list():
1319                structure["FORMAT"] = "VARCHAR"
1320                for sample in self.get_header_sample_list():
1321                    structure_complete[sample] = "VARCHAR"
1322
1323            # Columns list for create and insert
1324            sql_create_table_columns = []
1325            sql_create_table_columns_list = []
1326            for column in structure_complete:
1327                column_type = structure_complete[column]
1328                sql_create_table_columns.append(
1329                    f'"{column}" {column_type} default NULL'
1330                )
1331                sql_create_table_columns_list.append(f'"{column}"')
1332
1333            # Create database
1334            log.debug(f"Create Table {table_variants}")
1335            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
1336            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
1337            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
1338            self.conn.execute(sql_create_table)
1339
1340            # chunksize define length of file chunk load file
1341            chunksize = 100000
1342
1343            # delimiter
1344            delimiter = file_format_delimiters.get(input_format, "\t")
1345
1346            # Load the input file
1347            with open(self.input, "rt") as input_file:
1348
1349                # Use the appropriate file handler based on the input format
1350                if input_compressed:
1351                    input_file = bgzf.open(self.input, "rt")
1352                if input_format in ["vcf"]:
1353                    header_len = self.get_header_length()
1354                else:
1355                    header_len = 0
1356
1357                # Insert the file contents into a table
1358                self.insert_file_to_table(
1359                    input_file,
1360                    columns=sql_create_table_columns_list_sql,
1361                    header_len=header_len,
1362                    sep=delimiter,
1363                    chunksize=chunksize,
1364                )
1365
1366        else:
1367            log.error(
1368                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1369            )
1370            raise ValueError(
1371                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1372            )
1373
1374        # Explode INFOS fields into table fields
1375        if self.get_explode_infos():
1376            self.explode_infos(
1377                prefix=self.get_explode_infos_prefix(),
1378                fields=self.get_explode_infos_fields(),
1379                force=True,
1380            )
1381
1382        # Create index after insertion
1383        self.create_indexes()

The load_data function reads a VCF file and inserts it into a table, with options to drop the table before loading the data and specify a sample size.

Parameters
  • input_file: The path to the input file. This is the VCF file that will be loaded into the table
  • drop_variants_table: The drop_variants_table parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to True, the variants table will be dropped. If set to False (default), the variants table will not be dropped, defaults to False
  • sample_size: The sample_size parameter determines the number of rows to be sampled from the input file. If it is set to None, the default value of 20480 will be used, defaults to 20480
def get_explode_infos(self) -> bool:
1385    def get_explode_infos(self) -> bool:
1386        """
1387        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1388        to False if it is not set.
1389        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1390        value. If the parameter is not present, it will return False.
1391        """
1392
1393        return self.get_param().get("explode", {}).get("explode_infos", False)

The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting to False if it is not set.

Returns

The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.

def get_explode_infos_fields( self, explode_infos_fields: str = None, remove_fields_not_in_header: bool = False) -> list:
    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        Resolve the list of INFO fields to explode into table columns.

        Fields may be given as a comma-separated string or a list; each
        entry is treated as a regex pattern matched against the header
        INFO fields, and the keyword "*" expands to all header fields.
        When no fields are given, the "explode.explode_infos_fields"
        parameter is used, and failing that "*" (all fields).

        :param explode_infos_fields: fields to explode, as a
            comma-separated string or a list; "*" means all header fields
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: when True, fields that are not
            present in the header INFO section are excluded from the
            result, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: the ordered, de-duplicated list of resolved field names;
            fields listed explicitly take precedence over pattern
            expansions
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword "*"
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list (accept comma-separated string or list)
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without the "*" keyword
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Unique, sorted INFO fields declared in the header
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # Translate the "*" keyword into a match-all regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all header fields matching the pattern
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # An exact header match wins over its pattern expansion;
                # otherwise drop matches already requested explicitly so
                # explicit entries keep their own position in the output
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # Keep unmatched fields as-is (tolerates a not-well-formed
                # header) unless removal was requested
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []

The get_explode_infos_fields function returns a list of exploded information fields based on the input parameter explode_infos_fields.

Parameters
  • explode_infos_fields: The explode_infos_fields parameter is a string that specifies the fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a comma-separated list of field names to explode
  • remove_fields_not_in_header: The parameter remove_fields_not_in_header is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to True, any field that is not in the header will be excluded from the list of exploded information fields. If it is set to `, defaults to False
Returns

The function get_explode_infos_fields returns a list of exploded information fields. If the explode_infos_fields parameter is not provided or is set to None, it returns an empty list. If the parameter is provided and its value is "ALL", it also returns an empty list. Otherwise, it returns a list of exploded information fields after removing any spaces and splitting the string by commas.

def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1495    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1496        """
1497        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1498        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1499        not provided.
1500
1501        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1502        prefix to be used for exploding or expanding information
1503        :type explode_infos_prefix: str
1504        :return: the value of the variable `explode_infos_prefix`.
1505        """
1506
1507        if not explode_infos_prefix:
1508            explode_infos_prefix = (
1509                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1510            )
1511
1512        return explode_infos_prefix

The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is not provided.

Parameters
  • explode_infos_prefix: The parameter explode_infos_prefix is a string that specifies a prefix to be used for exploding or expanding information
Returns

the value of the variable explode_infos_prefix.

def add_column( self, table_name, column_name, column_type, default_value=None, drop: bool = False) -> dict:
    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        Add a column to a table if it does not already exist, optionally
        dropping and re-creating it when it does.

        :param table_name: name of the table to alter
        :param column_name: name of the column to add (matched
            case-insensitively against existing columns)
        :param column_type: SQL data type of the new column (e.g.
            "INTEGER", "VARCHAR")
        :param default_value: optional default value for the new column
            (interpolated as-is into the DDL statement)
        :param drop: when True and the column already exists, drop it and
            re-create it; when False (default) an existing column is left
            untouched and None is returned
        :type drop: bool (optional)
        :return: a dict describing the added column (table_name,
            column_name, column_type, default_value), or None when the
            column already existed — including when it was dropped and
            re-created (NOTE(review): looks intentional, worth confirming)
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table
        # NOTE(review): table/column names are interpolated directly into
        # SQL; callers must not pass untrusted identifiers
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # A dropped-and-recreated column is not reported as newly added
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column

The add_column function adds a column to a SQLite or DuckDB table with a default value if it doesn't already exist.

Parameters
  • table_name: The name of the table to which you want to add a column
  • column_name: The parameter "column_name" is the name of the column that you want to add to the table
  • column_type: The column_type parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc
  • default_value: The default_value parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column
  • drop: The drop parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If drop is set to True, the function will drop the existing column before adding the new column. If drop is set to False (default),, defaults to False
Returns

a boolean value indicating whether the column was successfully added to the table.

def drop_column( self, column: dict = None, table_name: str = None, column_name: str = None) -> bool:
1586    def drop_column(
1587        self, column: dict = None, table_name: str = None, column_name: str = None
1588    ) -> bool:
1589        """
1590        The `drop_column` function drops a specified column from a given table in a database and returns
1591        True if the column was successfully dropped, and False if the column does not exist in the
1592        table.
1593
1594        :param column: The `column` parameter is a dictionary that contains information about the column
1595        you want to drop. It has two keys:
1596        :type column: dict
1597        :param table_name: The `table_name` parameter is the name of the table from which you want to
1598        drop a column
1599        :type table_name: str
1600        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1601        from the table
1602        :type column_name: str
1603        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1604        and False if the column does not exist in the table.
1605        """
1606
1607        # Find column infos
1608        if column:
1609            if isinstance(column, dict):
1610                table_name = column.get("table_name", None)
1611                column_name = column.get("column_name", None)
1612            elif isinstance(column, str):
1613                table_name = self.get_table_variants()
1614                column_name = column
1615            else:
1616                table_name = None
1617                column_name = None
1618
1619        if not table_name and not column_name:
1620            return False
1621
1622        # Removed
1623        removed = False
1624
1625        # Check if the column already exists in the table
1626        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1627        columns = self.get_query_to_df(query).columns.tolist()
1628        if column_name in columns:
1629            log.debug(f"The {column_name} column exists in the {table_name} table")
1630        else:
1631            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1632            return False
1633
1634        # Add column in table # ALTER TABLE integers DROP k
1635        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1636        self.execute_query(add_column_query)
1637        removed = True
1638        log.debug(
1639            f"The {column_name} column was successfully dropped to the {table_name} table"
1640        )
1641
1642        return removed

The drop_column function drops a specified column from a given table in a database and returns True if the column was successfully dropped, and False if the column does not exist in the table.

Parameters
  • column: The column parameter is a dictionary that contains information about the column you want to drop. It has two keys:
  • table_name: The table_name parameter is the name of the table from which you want to drop a column
  • column_name: The column_name parameter is the name of the column that you want to drop from the table
Returns

a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.

def explode_infos( self, prefix: str = None, create_index: bool = False, fields: list = None, force: bool = False, proccess_all_fields_together: bool = False, table: str = None) -> list:
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode VCF INFO fields into individual columns of the variants
        table (one column per INFO field) and return the list of added
        columns.

        Each selected INFO field gets a dedicated column named
        `<prefix><field>`, populated by extracting the `field=value` pair
        from the raw INFO column with backend-specific SQL (regexp for
        DuckDB, substr/instr arithmetic for SQLite). Updates are run per
        chromosome to keep individual UPDATE statements bounded.

        :param prefix: Prefix for the exploded column names. When None/True
        or not a string, falls back to `self.get_explode_infos_prefix()`,
        then to "INFO/".
        :type prefix: str
        :param create_index: Whether to (re)create indexes after exploding.
        Defaults to False.
        :type create_index: bool (optional)
        :param fields: List of INFO field names (or patterns, translated via
        `get_explode_infos_fields`) to explode. Only fields present in the
        header, in the prefixed header names, or in the extra columns are
        processed.
        :type fields: list
        :param force: When True, an existing column is dropped and recreated
        (passed through to `add_column`), and its content is re-extracted
        even if the column already existed. Defaults to False.
        :type force: bool (optional)
        :param proccess_all_fields_together: When True, all field extractions
        are combined into a single UPDATE per chromosome; otherwise one
        UPDATE per field. Defaults to False.
        :type proccess_all_fields_together: bool (optional)
        :param table: Target table name; defaults to the variants table.
        :type table: str
        :return: The list of added columns (dicts as returned by
        `add_column`); empty when access is read-only or nothing was added.
        """

        # drop indexes (they would slow down / block the UPDATEs below)
        self.drop_indexes()

        # connexion format (selects the SQL extraction dialect below)
        connexion_format = self.get_connexion_format()

        # Access mode ("RO" = read-only: no schema or data changes allowed)
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: fall back to configured prefix, then to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants (explicit table overrides the default)
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos: best-effort; columns outside the header still
            # qualify a field for explosion
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos (INFO field declarations from the VCF header)
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check: header fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                # Only explode fields known from the header, the prefixed
                # header names, or the extra (non-header) columns
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        # Field not declared in header: treat as free text
                        info_type = "String"
                        info_num = 0

                    # Map VCF type to SQL type; multi-valued fields (num != 1)
                    # are stored as VARCHAR since they hold comma-joined values
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: SQL that extracts `info=value`
                        # from the raw INFO column, mapping '' and '.' to NULL
                        if connexion_format in ["duckdb"]:
                            # leading ';' is prepended so the first INFO field
                            # also matches the ';name=' anchor
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            # SQLite has no REGEXP_EXTRACT: locate '{info}='
                            # with instr and slice the value out with substr,
                            # up to the next ';' (or end of string)
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes: best-effort list of distinct #CHROM values;
                # on failure, fall back to a single unfiltered pass
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (skip the filter when there is only one
                    # chromosome: the full-table UPDATE is equivalent)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table: either one combined UPDATE for all fields,
                    # or one UPDATE per field
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns

The explode_infos function in Python takes a VCF file and explodes the INFO fields into individual columns, returning a list of added columns.

Parameters
  • prefix: The prefix parameter is a string that is used as a prefix for the exploded INFO fields. If the prefix is not provided or is set to None, the function will use the value of self.get_explode_infos_prefix() as the prefix
  • create_index: The create_index parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to True, indexes will be created; if set to False, indexes will not be created. The default value is False, defaults to False
  • fields: The fields parameter in the explode_infos function is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded. You can specify the INFO fields you want to explode by passing them as a list to the fields parameter.
  • force: The force parameter in the explode_infos function is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. If force is set to True, the column will be dropped and recreated. If force is set to False (the default), existing columns are kept.
  • proccess_all_fields_together: The proccess_all_fields_together parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to True, all the INFO fields will be processed together. If set to False, each INFO field will be processed individually. The default value is, defaults to False
  • table: The table parameter in the explode_infos function is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If you provide a value for the table parameter, the function will use that table name. If the table parameter is
Returns

The explode_infos function returns a list of added columns.

def create_indexes(self) -> None:
1861    def create_indexes(self) -> None:
1862        """
1863        Create indexes on the table after insertion
1864        """
1865
1866        # Access
1867        access = self.get_config().get("access", None)
1868
1869        # get table variants
1870        table_variants = self.get_table_variants("FROM")
1871
1872        if self.get_indexing() and access not in ["RO"]:
1873            # Create index
1874            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
1875            self.conn.execute(sql_create_table_index)
1876            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
1877            self.conn.execute(sql_create_table_index)
1878            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
1879            self.conn.execute(sql_create_table_index)
1880            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
1881            self.conn.execute(sql_create_table_index)
1882            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
1883            self.conn.execute(sql_create_table_index)
1884            for field in self.index_additionnal_fields:
1885                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
1886                self.conn.execute(sql_create_table_index)

Create indexes on the table after insertion

def drop_indexes(self) -> None:
1888    def drop_indexes(self) -> None:
1889        """
1890        Create indexes on the table after insertion
1891        """
1892
1893        # Access
1894        access = self.get_config().get("access", None)
1895
1896        # get table variants
1897        table_variants = self.get_table_variants("FROM")
1898
1899        # Get database format
1900        connexion_format = self.get_connexion_format()
1901
1902        if access not in ["RO"]:
1903            if connexion_format in ["duckdb"]:
1904                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
1905            elif connexion_format in ["sqlite"]:
1906                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
1907
1908            list_indexes = self.conn.execute(sql_list_indexes)
1909            index_names = [row[0] for row in list_indexes.fetchall()]
1910            for index in index_names:
1911                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
1912                self.conn.execute(sql_drop_table_index)

Drop all existing indexes on the variants table.

def read_vcf_header(self, f) -> list:
1914    def read_vcf_header(self, f) -> list:
1915        """
1916        It reads the header of a VCF file and returns a list of the header lines
1917
1918        :param f: the file object
1919        :return: The header lines of the VCF file.
1920        """
1921
1922        header_list = []
1923        for line in f:
1924            header_list.append(line)
1925            if line.startswith("#CHROM"):
1926                break
1927        return header_list

It reads the header of a VCF file and returns a list of the header lines

Parameters
  • f: the file object
Returns

The header lines of the VCF file.

def read_vcf_header_file(self, file: str = None) -> list:
1929    def read_vcf_header_file(self, file: str = None) -> list:
1930        """
1931        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
1932        uncompressed files.
1933
1934        :param file: The `file` parameter is a string that represents the path to the VCF header file
1935        that you want to read. It is an optional parameter, so if you don't provide a value, it will
1936        default to `None`
1937        :type file: str
1938        :return: The function `read_vcf_header_file` returns a list.
1939        """
1940
1941        if self.get_input_compressed(input_file=file):
1942            with bgzf.open(file, "rt") as f:
1943                return self.read_vcf_header(f=f)
1944        else:
1945            with open(file, "rt") as f:
1946                return self.read_vcf_header(f=f)

The read_vcf_header_file function reads the header of a VCF file, handling both compressed and uncompressed files.

Parameters
  • file: The file parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to None
Returns

The function read_vcf_header_file returns a list.

def execute_query(self, query: str):
1948    def execute_query(self, query: str):
1949        """
1950        It takes a query as an argument, executes it, and returns the results
1951
1952        :param query: The query to be executed
1953        :return: The result of the query is being returned.
1954        """
1955        if query:
1956            return self.conn.execute(query)  # .fetchall()
1957        else:
1958            return None

It takes a query as an argument, executes it, and returns the results

Parameters
  • query: The query to be executed
Returns

The result of the query is being returned.

def export_output( self, output_file: str | None = None, output_header: str | None = None, export_header: bool = True, query: str | None = None, parquet_partitions: list | None = None, chunk_size: int | None = None, threads: int | None = None, sort: bool = False, index: bool = False, order_by: str | None = None) -> bool:
1960    def export_output(
1961        self,
1962        output_file: str | None = None,
1963        output_header: str | None = None,
1964        export_header: bool = True,
1965        query: str | None = None,
1966        parquet_partitions: list | None = None,
1967        chunk_size: int | None = None,
1968        threads: int | None = None,
1969        sort: bool = False,
1970        index: bool = False,
1971        order_by: str | None = None,
1972    ) -> bool:
1973        """
1974        The `export_output` function exports data from a VCF file to a specified output file in various
1975        formats, including VCF, CSV, TSV, PSV, and Parquet.
1976
1977        :param output_file: The `output_file` parameter is a string that specifies the name of the
1978        output file to be generated by the function. This is where the exported data will be saved
1979        :type output_file: str
1980        :param output_header: The `output_header` parameter is a string that specifies the name of the
1981        file where the header of the VCF file will be exported. If this parameter is not provided, the
1982        header will be exported to a file with the same name as the `output_file` parameter, but with
1983        the extension "
1984        :type output_header: str
1985        :param export_header: The `export_header` parameter is a boolean flag that determines whether
1986        the header of a VCF file should be exported to a separate file or not. If `export_header` is
1987        True, the header will be exported to a file. If `export_header` is False, the header will not
1988        be, defaults to True, if output format is not VCF
1989        :type export_header: bool (optional)
1990        :param query: The `query` parameter is an optional SQL query that can be used to filter and
1991        select specific data from the VCF file before exporting it. If provided, only the data that
1992        matches the query will be exported
1993        :type query: str
1994        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
1995        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
1996        organize data in a hierarchical directory structure based on the values of one or more columns.
1997        This can improve query performance when working with large datasets
1998        :type parquet_partitions: list
1999        :param chunk_size: The `chunk_size` parameter specifies the number of
2000        records in batch when exporting data in Parquet format. This parameter is used for
2001        partitioning the Parquet file into multiple files.
2002        :type chunk_size: int
2003        :param threads: The `threads` parameter is an optional parameter that specifies the number of
2004        threads to be used during the export process. It determines the level of parallelism and can
2005        improve the performance of the export operation. If not provided, the function will use the
2006        default number of threads
2007        :type threads: int
2008        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
2009        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
2010        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
2011        False
2012        :type sort: bool (optional)
2013        :param index: The `index` parameter is a boolean flag that determines whether an index should be
2014        created on the output file. If `index` is True, an index will be created. If `index` is False,
2015        no index will be created. The default value is False, defaults to False
2016        :type index: bool (optional)
2017        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
2018        sorting the output file. This parameter is only applicable when exporting data in VCF format
2019        :type order_by: str
2020        :return: a boolean value. It checks if the output file exists and returns True if it does, or
2021        None if it doesn't.
2022        """
2023
2024        # Log
2025        log.info("Exporting...")
2026
2027        # Full path
2028        output_file = full_path(output_file)
2029        output_header = full_path(output_header)
2030
2031        # Config
2032        config = self.get_config()
2033
2034        # Param
2035        param = self.get_param()
2036
2037        # Tmp files to remove
2038        tmp_to_remove = []
2039
2040        # If no output, get it
2041        if not output_file:
2042            output_file = self.get_output()
2043
2044        # If not threads
2045        if not threads:
2046            threads = self.get_threads()
2047
2048        # Auto header name with extension
2049        if export_header or output_header:
2050            if not output_header:
2051                output_header = f"{output_file}.hdr"
2052            # Export header
2053            self.export_header(output_file=output_file)
2054
2055        # Switch off export header if VCF output
2056        output_file_type = get_file_format(output_file)
2057        if output_file_type in ["vcf"]:
2058            export_header = False
2059            tmp_to_remove.append(output_header)
2060
2061        # Chunk size
2062        if not chunk_size:
2063            chunk_size = config.get("chunk_size", None)
2064
2065        # Parquet partition
2066        if not parquet_partitions:
2067            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
2068        if parquet_partitions and isinstance(parquet_partitions, str):
2069            parquet_partitions = parquet_partitions.split(",")
2070
2071        # Order by
2072        if not order_by:
2073            order_by = param.get("export", {}).get("order_by", "")
2074
2075        # Header in output
2076        header_in_output = param.get("export", {}).get("include_header", False)
2077
2078        # Database
2079        database_source = self.get_connexion()
2080
2081        # Connexion format
2082        connexion_format = self.get_connexion_format()
2083
2084        # Explode infos
2085        if self.get_explode_infos():
2086            self.explode_infos(
2087                prefix=self.get_explode_infos_prefix(),
2088                fields=self.get_explode_infos_fields(),
2089                force=False,
2090            )
2091
2092        # if connexion_format in ["sqlite"] or query:
2093        if connexion_format in ["sqlite"]:
2094
2095            # Export in Parquet
2096            random_tmp = "".join(
2097                random.choice(string.ascii_lowercase) for i in range(10)
2098            )
2099            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
2100            tmp_to_remove.append(database_source)
2101
2102            # Table Variants
2103            table_variants = self.get_table_variants()
2104
2105            # Create export query
2106            sql_query_export_subquery = f"""
2107                SELECT * FROM {table_variants}
2108                """
2109
2110            # Write source file
2111            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
2112
2113        # Create database
2114        database = Database(
2115            database=database_source,
2116            table="variants",
2117            header_file=output_header,
2118            conn_config=self.get_connexion_config(),
2119        )
2120
2121        # Existing colomns header
2122        # existing_columns_header = database.get_header_file_columns(output_header)
2123        existing_columns_header = database.get_header_columns_from_database()
2124
2125        # Export file
2126        database.export(
2127            output_database=output_file,
2128            output_header=output_header,
2129            existing_columns_header=existing_columns_header,
2130            parquet_partitions=parquet_partitions,
2131            chunk_size=chunk_size,
2132            threads=threads,
2133            sort=sort,
2134            index=index,
2135            header_in_output=header_in_output,
2136            order_by=order_by,
2137            query=query,
2138            export_header=export_header,
2139        )
2140
2141        # Remove
2142        remove_if_exists(tmp_to_remove)
2143
2144        return (os.path.exists(output_file) or None) and (
2145            os.path.exists(output_file) or None
2146        )

The export_output function exports data from a VCF file to a specified output file in various formats, including VCF, CSV, TSV, PSV, and Parquet.

Parameters
  • output_file: The output_file parameter is a string that specifies the name of the output file to be generated by the function. This is where the exported data will be saved
  • output_header: The output_header parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the output_file parameter, but with the extension "
  • export_header: The export_header parameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. If export_header is True, the header will be exported to a file. If export_header is False, the header will not be, defaults to True, if output format is not VCF
  • query: The query parameter is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported
  • parquet_partitions: The parquet_partitions parameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets
  • chunk_size: The chunk_size parameter specifies the number of records in batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files.
  • threads: The threads parameter is an optional parameter that specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If not provided, the function will use the default number of threads
  • sort: The sort parameter is a boolean flag that determines whether the output file should be sorted or not. If sort is set to True, the output file will be sorted based on the genomic coordinates of the variants. By default, the value of sort is False, defaults to False
  • index: The index parameter is a boolean flag that determines whether an index should be created on the output file. If index is True, an index will be created. If index is False, no index will be created. The default value is False, defaults to False
  • order_by: The order_by parameter is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format
Returns

a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.

def get_extra_infos(self, table: str = None) -> list:
2148    def get_extra_infos(self, table: str = None) -> list:
2149        """
2150        The `get_extra_infos` function returns a list of columns that are in a specified table but not
2151        in the header.
2152
2153        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
2154        name of the table from which you want to retrieve the extra columns that are not present in the
2155        header. If the `table` parameter is not provided when calling the function, it will default to
2156        using the variants
2157        :type table: str
2158        :return: A list of columns that are in the specified table but not in the header of the table.
2159        """
2160
2161        header_columns = []
2162
2163        if not table:
2164            table = self.get_table_variants(clause="from")
2165            header_columns = self.get_header_columns()
2166
2167        # Check all columns in the database
2168        query = f""" SELECT * FROM {table} LIMIT 1 """
2169        log.debug(f"query {query}")
2170        table_columns = self.get_query_to_df(query).columns.tolist()
2171        extra_columns = []
2172
2173        # Construct extra infos (not in header)
2174        for column in table_columns:
2175            if column not in header_columns:
2176                extra_columns.append(column)
2177
2178        return extra_columns

The get_extra_infos function returns a list of columns that are in a specified table but not in the header.

Parameters
  • table: The table parameter in the get_extra_infos function specifies the name of the table from which to retrieve the extra columns that are not present in the header. If the table parameter is not provided, it defaults to the variants table.
Returns

A list of columns that are in the specified table but not in the header of the table.

def get_extra_infos_sql(self, table: str = None) -> str:
2180    def get_extra_infos_sql(self, table: str = None) -> str:
2181        """
2182        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2183        by double quotes
2184
2185        :param table: The name of the table to get the extra infos from. If None, the default table is
2186        used
2187        :type table: str
2188        :return: A string of the extra infos
2189        """
2190
2191        return ", ".join(
2192            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2193        )

It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes

Parameters
  • table: The name of the table to get the extra infos from. If None, the default table is used
Returns

A string of the extra infos

def export_header( self, header_name: str = None, output_file: str = None, output_file_ext: str = '.hdr', clean_header: bool = True, remove_chrom_line: bool = False) -> str:
2195    def export_header(
2196        self,
2197        header_name: str = None,
2198        output_file: str = None,
2199        output_file_ext: str = ".hdr",
2200        clean_header: bool = True,
2201        remove_chrom_line: bool = False,
2202    ) -> str:
2203        """
2204        The `export_header` function takes a VCF file, extracts the header, modifies it according to
2205        specified options, and writes it to a new file.
2206
2207        :param header_name: The `header_name` parameter is the name of the header file to be created. If
2208        this parameter is not specified, the header will be written to the output file
2209        :type header_name: str
2210        :param output_file: The `output_file` parameter in the `export_header` function is used to
2211        specify the name of the output file where the header will be written. If this parameter is not
2212        provided, the header will be written to a temporary file
2213        :type output_file: str
2214        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2215        string that represents the extension of the output header file. By default, it is set to ".hdr"
2216        if not specified by the user. This extension will be appended to the `output_file` name to
2217        create the final, defaults to .hdr
2218        :type output_file_ext: str (optional)
2219        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2220        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2221        `True`, the function will clean the header by modifying certain lines based on a specific
2222        pattern. If `clean_header`, defaults to True
2223        :type clean_header: bool (optional)
2224        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2225        boolean flag that determines whether the #CHROM line should be removed from the header before
2226        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
2227        defaults to False
2228        :type remove_chrom_line: bool (optional)
2229        :return: The function `export_header` returns the name of the temporary header file that is
2230        created.
2231        """
2232
2233        if not header_name and not output_file:
2234            output_file = self.get_output()
2235
2236        if self.get_header():
2237
2238            # Get header object
2239            header_obj = self.get_header()
2240
2241            # Create database
2242            db_for_header = Database(database=self.get_input())
2243
2244            # Get real columns in the file
2245            db_header_columns = db_for_header.get_columns()
2246
2247            with tempfile.TemporaryDirectory() as tmpdir:
2248
2249                # Write header file
2250                header_file_tmp = os.path.join(tmpdir, "header")
2251                f = open(header_file_tmp, "w")
2252                vcf.Writer(f, header_obj)
2253                f.close()
2254
2255                # Replace #CHROM line with rel columns
2256                header_list = db_for_header.read_header_file(
2257                    header_file=header_file_tmp
2258                )
2259                header_list[-1] = "\t".join(db_header_columns)
2260
2261                # Remove CHROM line
2262                if remove_chrom_line:
2263                    header_list.pop()
2264
2265                # Clean header
2266                if clean_header:
2267                    header_list_clean = []
2268                    for head in header_list:
2269                        # Clean head for malformed header
2270                        head_clean = head
2271                        head_clean = re.subn(
2272                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2273                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
2274                            head_clean,
2275                            2,
2276                        )[0]
2277                        # Write header
2278                        header_list_clean.append(head_clean)
2279                    header_list = header_list_clean
2280
2281            tmp_header_name = output_file + output_file_ext
2282
2283            f = open(tmp_header_name, "w")
2284            for line in header_list:
2285                f.write(line)
2286            f.close()
2287
2288        return tmp_header_name

The export_header function takes a VCF file, extracts the header, modifies it according to specified options, and writes it to a new file.

Parameters
  • header_name: The header_name parameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file
  • output_file: The output_file parameter in the export_header function is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file
  • output_file_ext: The output_file_ext parameter in the export_header function is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to the output_file name to create the final, defaults to .hdr
  • clean_header: The clean_header parameter in the export_header function is a boolean flag that determines whether the header should be cleaned. When clean_header is set to True, the function will clean the header by modifying certain lines based on a specific pattern; when False, the header lines are written unchanged. Defaults to True.
  • remove_chrom_line: The remove_chrom_line parameter in the export_header function is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to True, the #CHROM line will be removed; if set to False, it is kept. Defaults to False.
Returns

The function export_header returns the name of the temporary header file that is created.

def export_variant_vcf( self, vcf_file, remove_info: bool = False, add_samples: bool = True, list_samples: list = [], where_clause: str = '', index: bool = False, threads: int | None = None) -> bool | None:
2290    def export_variant_vcf(
2291        self,
2292        vcf_file,
2293        remove_info: bool = False,
2294        add_samples: bool = True,
2295        list_samples: list = [],
2296        where_clause: str = "",
2297        index: bool = False,
2298        threads: int | None = None,
2299    ) -> bool | None:
2300        """
2301        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2302        remove INFO field, add samples, and control compression and indexing.
2303
2304        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2305        written to. It is the output file that will contain the filtered VCF data based on the specified
2306        parameters
2307        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2308        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2309        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2310        in, defaults to False
2311        :type remove_info: bool (optional)
2312        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2313        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2314        If set to False, the samples will be removed. The default value is True, defaults to True
2315        :type add_samples: bool (optional)
2316        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2317        in the output VCF file. By default, all samples will be included. If you provide a list of
2318        samples, only those samples will be included in the output file
2319        :type list_samples: list
2320        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2321        determines whether or not to create an index for the output VCF file. If `index` is set to
2322        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2323        :type index: bool (optional)
2324        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2325        number of threads to use for exporting the VCF file. It determines how many parallel threads
2326        will be used during the export process. More threads can potentially speed up the export process
2327        by utilizing multiple cores of the processor. If
2328        :type threads: int | None
2329        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2330        method with various parameters including the output file, query, threads, sort flag, and index
2331        flag. The `export_output` method is responsible for exporting the VCF data based on the
2332        specified parameters and configurations provided in the `export_variant_vcf` function.
2333        """
2334
2335        # Config
2336        config = self.get_config()
2337
2338        # Extract VCF
2339        log.debug("Export VCF...")
2340
2341        # Table variants
2342        table_variants = self.get_table_variants()
2343
2344        # Threads
2345        if not threads:
2346            threads = self.get_threads()
2347
2348        # Info fields
2349        if remove_info:
2350            if not isinstance(remove_info, str):
2351                remove_info = "."
2352            info_field = f"""'{remove_info}' as INFO"""
2353        else:
2354            info_field = "INFO"
2355
2356        # Samples fields
2357        if add_samples:
2358            if not list_samples:
2359                list_samples = self.get_header_sample_list()
2360            if list_samples:
2361                samples_fields = " , FORMAT , " + " , ".join(list_samples)
2362            else:
2363                samples_fields = ""
2364            log.debug(f"samples_fields: {samples_fields}")
2365        else:
2366            samples_fields = ""
2367
2368        # Where clause
2369        if where_clause is None:
2370            where_clause = ""
2371
2372        # Variants
2373        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2374        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
2375        log.debug(f"sql_query_select={sql_query_select}")
2376
2377        return self.export_output(
2378            output_file=vcf_file,
2379            output_header=None,
2380            export_header=True,
2381            query=sql_query_select,
2382            parquet_partitions=None,
2383            chunk_size=config.get("chunk_size", None),
2384            threads=threads,
2385            sort=True,
2386            index=index,
2387            order_by=None,
2388        )

The export_variant_vcf function exports a VCF file with specified samples, allowing options to remove INFO field, add samples, and control compression and indexing.

Parameters
  • vcf_file: The vcf_file parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters
  • remove_info: The remove_info parameter in the export_variant_vcf function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to True, the INFO field will be removed; if set to False, the INFO field will be included in the output. Defaults to False.
  • add_samples: The add_samples parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True
  • list_samples: The list_samples parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file
  • index: The index parameter in the export_variant_vcf function is a boolean flag that determines whether or not to create an index for the output VCF file. If index is set to True, the output VCF file will be indexed using tabix. If index, defaults to False
  • threads: The threads parameter in the export_variant_vcf function specifies the number of threads to use for exporting the VCF file. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If not provided, the number of threads configured for the object is used.
Returns

The export_variant_vcf function returns the result of calling the export_output method with various parameters including the output file, query, threads, sort flag, and index flag. The export_output method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the export_variant_vcf function.

def run_commands(self, commands: list = [], threads: int = 1) -> None:
2390    def run_commands(self, commands: list = [], threads: int = 1) -> None:
2391        """
2392        It takes a list of commands and runs them in parallel using the number of threads specified
2393
2394        :param commands: A list of commands to run
2395        :param threads: The number of threads to use, defaults to 1 (optional)
2396        """
2397
2398        run_parallel_commands(commands, threads)

It takes a list of commands and runs them in parallel using the number of threads specified

Parameters
  • commands: A list of commands to run
  • threads: The number of threads to use, defaults to 1 (optional)
def get_threads(self, default: int = 1) -> int:
2400    def get_threads(self, default: int = 1) -> int:
2401        """
2402        This function returns the number of threads to use for a job, with a default value of 1 if not
2403        specified.
2404
2405        :param default: The `default` parameter in the `get_threads` method is used to specify the
2406        default number of threads to use if no specific value is provided. If no value is provided for
2407        the `threads` parameter in the configuration or input parameters, the `default` value will be
2408        used, defaults to 1
2409        :type default: int (optional)
2410        :return: the number of threads to use for the current job.
2411        """
2412
2413        # Config
2414        config = self.get_config()
2415
2416        # Param
2417        param = self.get_param()
2418
2419        # Input threads
2420        input_thread = param.get("threads", config.get("threads", None))
2421
2422        # Check threads
2423        if not input_thread:
2424            threads = default
2425        elif int(input_thread) <= 0:
2426            threads = os.cpu_count()
2427        else:
2428            threads = int(input_thread)
2429        return threads

This function returns the number of threads to use for a job, with a default value of 1 if not specified.

Parameters
  • default: The default parameter in the get_threads method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the threads parameter in the configuration or input parameters, the default value will be used, defaults to 1
Returns

the number of threads to use for the current job.

def get_memory(self, default: str = None) -> str:
2431    def get_memory(self, default: str = None) -> str:
2432        """
2433        This function retrieves the memory value from parameters or configuration with a default value
2434        if not found.
2435
2436        :param default: The `get_memory` function takes in a default value as a string parameter. This
2437        default value is used as a fallback in case the `memory` parameter is not provided in the
2438        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2439        the function
2440        :type default: str
2441        :return: The `get_memory` function returns a string value representing the memory parameter. If
2442        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2443        return the default value provided as an argument to the function.
2444        """
2445
2446        # Config
2447        config = self.get_config()
2448
2449        # Param
2450        param = self.get_param()
2451
2452        # Input threads
2453        input_memory = param.get("memory", config.get("memory", None))
2454
2455        # Check threads
2456        if input_memory:
2457            memory = input_memory
2458        else:
2459            memory = default
2460
2461        return memory

This function retrieves the memory value from parameters or configuration with a default value if not found.

Parameters
  • default: The get_memory function takes in a default value as a string parameter. This default value is used as a fallback in case the memory parameter is not provided in the param dictionary or the config dictionary. If memory is not found in either dictionary, the function returns the default value instead.
Returns

The get_memory function returns a string value representing the memory parameter. If the input_memory is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.

def update_from_vcf(self, vcf_file: str) -> None:
2463    def update_from_vcf(self, vcf_file: str) -> None:
2464        """
2465        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2466
2467        :param vcf_file: the path to the VCF file
2468        """
2469
2470        connexion_format = self.get_connexion_format()
2471
2472        if connexion_format in ["duckdb"]:
2473            self.update_from_vcf_duckdb(vcf_file)
2474        elif connexion_format in ["sqlite"]:
2475            self.update_from_vcf_sqlite(vcf_file)

If the database is duckdb, then use the parquet method, otherwise use the sqlite method

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_duckdb(self, vcf_file: str) -> None:
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of
        the given VCF file, joining rows on #CHROM/POS/REF/ALT. Existing INFO
        content is kept and the new content is appended with a ';' separator.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the header lines so
        # that the #CHROM line becomes the column header
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: the SQL below references the local DataFrame `vcf_df` by name
        # — presumably relying on duckdb's replacement scan of in-scope
        # Python DataFrames; confirm the connexion is duckdb-backed.
        # The nested CASEs only insert a ';' separator when both sides carry
        # real content, and treat '' and '.' as empty INFO values.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)

It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_sqlite(self, vcf_file: str) -> None:
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Load the given VCF file into a temporary SQLite table, then append
        its INFO content to the INFO column of the variants table, joining
        rows on #CHROM/POS/REF/ALT. The temporary table is dropped afterwards.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table for the VCF with the same schema as
        # 'variants' (WHERE 0 copies the structure but no rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body into the temporary table; comment='#' skips all
        # header lines, so the fixed 8 VCF columns are assigned explicitly
        # NOTE(review): assumes the VCF body has exactly 8 columns (no
        # FORMAT/sample columns) — confirm against the callers
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data; '' and '.' are treated as
        # empty INFO values, and ';' is inserted only between real content
        # warning: CONCAT as || operator (SQLite has no concat())
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)

It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table

Parameters
  • vcf_file: The path to the VCF file you want to update the database with
def drop_variants_table(self) -> None:
2591    def drop_variants_table(self) -> None:
2592        """
2593        > This function drops the variants table
2594        """
2595
2596        table_variants = self.get_table_variants()
2597        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2598        self.conn.execute(sql_table_variants)

This function drops the variants table

def set_variant_id(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
2600    def set_variant_id(
2601        self, variant_id_column: str = "variant_id", force: bool = None
2602    ) -> str:
2603        """
2604        It adds a column to the variants table called `variant_id` and populates it with a hash of the
2605        `#CHROM`, `POS`, `REF`, and `ALT` columns
2606
2607        :param variant_id_column: The name of the column to be created in the variants table, defaults
2608        to variant_id
2609        :type variant_id_column: str (optional)
2610        :param force: If True, the variant_id column will be created even if it already exists
2611        :type force: bool
2612        :return: The name of the column that contains the variant_id
2613        """
2614
2615        # Assembly
2616        assembly = self.get_param().get(
2617            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
2618        )
2619
2620        # INFO/Tag prefix
2621        prefix = self.get_explode_infos_prefix()
2622
2623        # Explode INFO/SVTYPE
2624        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
2625
2626        # variants table
2627        table_variants = self.get_table_variants()
2628
2629        # variant_id column
2630        if not variant_id_column:
2631            variant_id_column = "variant_id"
2632
2633        # Creta variant_id column
2634        if "variant_id" not in self.get_extra_infos() or force:
2635
2636            # Create column
2637            self.add_column(
2638                table_name=table_variants,
2639                column_name=variant_id_column,
2640                column_type="UBIGINT",
2641                default_value="0",
2642            )
2643
2644            # Update column
2645            self.conn.execute(
2646                f"""
2647                    UPDATE {table_variants}
2648                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
2649                """
2650            )
2651
2652        # Remove added columns
2653        for added_column in added_columns:
2654            self.drop_column(column=added_column)
2655
2656        # return variant_id column name
2657        return variant_id_column

It adds a column to the variants table called variant_id and populates it with a hash of the #CHROM, POS, REF, and ALT columns

Parameters
  • variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
  • force: If True, the variant_id column will be created even if it already exists
Returns

The name of the column that contains the variant_id

def get_variant_id_column(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
2659    def get_variant_id_column(
2660        self, variant_id_column: str = "variant_id", force: bool = None
2661    ) -> str:
2662        """
2663        This function returns the variant_id column name
2664
2665        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2666        defaults to variant_id
2667        :type variant_id_column: str (optional)
2668        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
2669        False, will only set the variant_id if it is not already set. If None, will set the variant_id
2670        if it is not already set, or if it is set
2671        :type force: bool
2672        :return: The variant_id column name.
2673        """
2674
2675        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

This function returns the variant_id column name

Parameters
  • variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
  • force: If True, the variant_id column will be recomputed even if it already exists. If False or None, the column is only created and populated when it is not already present.
Returns

The variant_id column name.

def scan_databases( self, database_formats: list = ['parquet'], database_releases: list = ['current']) -> dict:
2681    def scan_databases(
2682        self,
2683        database_formats: list = ["parquet"],
2684        database_releases: list = ["current"],
2685    ) -> dict:
2686        """
2687        The function `scan_databases` scans for available databases based on specified formats and
2688        releases.
2689
2690        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2691        of the databases to be scanned. In this case, the accepted format is "parquet"
2692        :type database_formats: list ["parquet"]
2693        :param database_releases: The `database_releases` parameter is a list that specifies the
2694        releases of the databases to be scanned. In the provided function, the default value for
2695        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2696        databases that are in the "current"
2697        :type database_releases: list
2698        :return: The function `scan_databases` returns a dictionary containing information about
2699        databases that match the specified formats and releases.
2700        """
2701
2702        # Config
2703        config = self.get_config()
2704
2705        # Param
2706        param = self.get_param()
2707
2708        # Param - Assembly
2709        assembly = param.get("assembly", config.get("assembly", None))
2710        if not assembly:
2711            assembly = DEFAULT_ASSEMBLY
2712            log.warning(f"Default assembly '{assembly}'")
2713
2714        # Scan for availabled databases
2715        log.info(
2716            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2717        )
2718        databases_infos_dict = databases_infos(
2719            database_folder_releases=database_releases,
2720            database_formats=database_formats,
2721            assembly=assembly,
2722            config=config,
2723        )
2724        log.info(
2725            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2726        )
2727
2728        return databases_infos_dict

The function scan_databases scans for available databases based on specified formats and releases.

Parameters
  • database_formats: The database_formats parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet"
  • database_releases: The database_releases parameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value for database_releases is set to ["current"], meaning that by default, the function will scan databases that are in the "current" release
Returns

The function scan_databases returns a dictionary containing information about databases that match the specified formats and releases.

def annotation(self) -> None:
2730    def annotation(self) -> None:
2731        """
2732        It annotates the VCF file with the annotations specified in the config file.
2733        """
2734
2735        # Config
2736        config = self.get_config()
2737
2738        # Param
2739        param = self.get_param()
2740
2741        # Param - Assembly
2742        assembly = param.get("assembly", config.get("assembly", None))
2743        if not assembly:
2744            assembly = DEFAULT_ASSEMBLY
2745            log.warning(f"Default assembly '{assembly}'")
2746
2747        # annotations databases folders
2748        annotations_databases = set(
2749            config.get("folders", {})
2750            .get("databases", {})
2751            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
2752            + config.get("folders", {})
2753            .get("databases", {})
2754            .get("parquet", ["~/howard/databases/parquet/current"])
2755            + config.get("folders", {})
2756            .get("databases", {})
2757            .get("bcftools", ["~/howard/databases/bcftools/current"])
2758        )
2759
2760        # Get param annotations
2761        if param.get("annotations", None) and isinstance(
2762            param.get("annotations", None), str
2763        ):
2764            log.debug(param.get("annotations", None))
2765            param_annotation_list = param.get("annotations").split(",")
2766        else:
2767            param_annotation_list = []
2768
2769        # Each tools param
2770        if param.get("annotation_parquet", None) != None:
2771            log.debug(
2772                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
2773            )
2774            if isinstance(param.get("annotation_parquet", None), list):
2775                param_annotation_list.append(",".join(param.get("annotation_parquet")))
2776            else:
2777                param_annotation_list.append(param.get("annotation_parquet"))
2778        if param.get("annotation_snpsift", None) != None:
2779            if isinstance(param.get("annotation_snpsift", None), list):
2780                param_annotation_list.append(
2781                    "snpsift:"
2782                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
2783                )
2784            else:
2785                param_annotation_list.append(
2786                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
2787                )
2788        if param.get("annotation_snpeff", None) != None:
2789            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
2790        if param.get("annotation_bcftools", None) != None:
2791            if isinstance(param.get("annotation_bcftools", None), list):
2792                param_annotation_list.append(
2793                    "bcftools:"
2794                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
2795                )
2796            else:
2797                param_annotation_list.append(
2798                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
2799                )
2800        if param.get("annotation_annovar", None) != None:
2801            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
2802        if param.get("annotation_exomiser", None) != None:
2803            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
2804        if param.get("annotation_splice", None) != None:
2805            param_annotation_list.append("splice:" + param.get("annotation_splice"))
2806
2807        # Merge param annotations list
2808        param["annotations"] = ",".join(param_annotation_list)
2809
2810        # debug
2811        log.debug(f"param_annotations={param['annotations']}")
2812
2813        if param.get("annotations"):
2814
2815            # Log
2816            # log.info("Annotations - Check annotation parameters")
2817
2818            if not "annotation" in param:
2819                param["annotation"] = {}
2820
2821            # List of annotations parameters
2822            annotations_list_input = {}
2823            if isinstance(param.get("annotations", None), str):
2824                annotation_file_list = [
2825                    value for value in param.get("annotations", "").split(",")
2826                ]
2827                for annotation_file in annotation_file_list:
2828                    annotations_list_input[annotation_file] = {"INFO": None}
2829            else:
2830                annotations_list_input = param.get("annotations", {})
2831
2832            log.info(f"Quick Annotations:")
2833            for annotation_key in list(annotations_list_input.keys()):
2834                log.info(f"   {annotation_key}")
2835
2836            # List of annotations and associated fields
2837            annotations_list = {}
2838
2839            for annotation_file in annotations_list_input:
2840
2841                # Explode annotations if ALL
2842                if (
2843                    annotation_file.upper() == "ALL"
2844                    or annotation_file.upper().startswith("ALL:")
2845                ):
2846
2847                    # check ALL parameters (formats, releases)
2848                    annotation_file_split = annotation_file.split(":")
2849                    database_formats = "parquet"
2850                    database_releases = "current"
2851                    for annotation_file_option in annotation_file_split[1:]:
2852                        database_all_options_split = annotation_file_option.split("=")
2853                        if database_all_options_split[0] == "format":
2854                            database_formats = database_all_options_split[1].split("+")
2855                        if database_all_options_split[0] == "release":
2856                            database_releases = database_all_options_split[1].split("+")
2857
2858                    # Scan for availabled databases
2859                    databases_infos_dict = self.scan_databases(
2860                        database_formats=database_formats,
2861                        database_releases=database_releases,
2862                    )
2863
2864                    # Add found databases in annotation parameters
2865                    for database_infos in databases_infos_dict.keys():
2866                        annotations_list[database_infos] = {"INFO": None}
2867
2868                else:
2869                    annotations_list[annotation_file] = annotations_list_input[
2870                        annotation_file
2871                    ]
2872
2873            # Check each databases
2874            if len(annotations_list):
2875
2876                log.info(
2877                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
2878                )
2879
2880                for annotation_file in annotations_list:
2881
2882                    # Init
2883                    annotations = annotations_list.get(annotation_file, None)
2884
2885                    # Annotation snpEff
2886                    if annotation_file.startswith("snpeff"):
2887
2888                        log.debug(f"Quick Annotation snpEff")
2889
2890                        if "snpeff" not in param["annotation"]:
2891                            param["annotation"]["snpeff"] = {}
2892
2893                        if "options" not in param["annotation"]["snpeff"]:
2894                            param["annotation"]["snpeff"]["options"] = ""
2895
2896                        # snpEff options in annotations
2897                        param["annotation"]["snpeff"]["options"] = "".join(
2898                            annotation_file.split(":")[1:]
2899                        )
2900
2901                    # Annotation Annovar
2902                    elif annotation_file.startswith("annovar"):
2903
2904                        log.debug(f"Quick Annotation Annovar")
2905
2906                        if "annovar" not in param["annotation"]:
2907                            param["annotation"]["annovar"] = {}
2908
2909                        if "annotations" not in param["annotation"]["annovar"]:
2910                            param["annotation"]["annovar"]["annotations"] = {}
2911
2912                        # Options
2913                        annotation_file_split = annotation_file.split(":")
2914                        for annotation_file_annotation in annotation_file_split[1:]:
2915                            if annotation_file_annotation:
2916                                param["annotation"]["annovar"]["annotations"][
2917                                    annotation_file_annotation
2918                                ] = annotations
2919
2920                    # Annotation Exomiser
2921                    elif annotation_file.startswith("exomiser"):
2922
2923                        log.debug(f"Quick Annotation Exomiser")
2924
2925                        param["annotation"]["exomiser"] = params_string_to_dict(
2926                            annotation_file
2927                        )
2928
2929                    # Annotation Splice
2930                    elif annotation_file.startswith("splice"):
2931
2932                        log.debug(f"Quick Annotation Splice")
2933
2934                        param["annotation"]["splice"] = params_string_to_dict(
2935                            annotation_file
2936                        )
2937
2938                    # Annotation Parquet or BCFTOOLS
2939                    else:
2940
2941                        # Tools detection
2942                        if annotation_file.startswith("bcftools:"):
2943                            annotation_tool_initial = "bcftools"
2944                            annotation_file = ":".join(annotation_file.split(":")[1:])
2945                        elif annotation_file.startswith("snpsift:"):
2946                            annotation_tool_initial = "snpsift"
2947                            annotation_file = ":".join(annotation_file.split(":")[1:])
2948                        else:
2949                            annotation_tool_initial = None
2950
2951                        # list of files
2952                        annotation_file_list = annotation_file.replace("+", ":").split(
2953                            ":"
2954                        )
2955
2956                        for annotation_file in annotation_file_list:
2957
2958                            if annotation_file:
2959
2960                                # Annotation tool initial
2961                                annotation_tool = annotation_tool_initial
2962
2963                                # Find file
2964                                annotation_file_found = None
2965
2966                                # Expand user
2967                                annotation_file = full_path(annotation_file)
2968
2969                                if os.path.exists(annotation_file):
2970                                    annotation_file_found = annotation_file
2971
2972                                else:
2973                                    # Find within assembly folders
2974                                    for annotations_database in annotations_databases:
2975                                        found_files = find_all(
2976                                            annotation_file,
2977                                            os.path.join(
2978                                                annotations_database, assembly
2979                                            ),
2980                                        )
2981                                        if len(found_files) > 0:
2982                                            annotation_file_found = found_files[0]
2983                                            break
2984                                    if not annotation_file_found and not assembly:
2985                                        # Find within folders
2986                                        for (
2987                                            annotations_database
2988                                        ) in annotations_databases:
2989                                            found_files = find_all(
2990                                                annotation_file, annotations_database
2991                                            )
2992                                            if len(found_files) > 0:
2993                                                annotation_file_found = found_files[0]
2994                                                break
2995                                log.debug(
2996                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
2997                                )
2998
2999                                # Full path
3000                                annotation_file_found = full_path(annotation_file_found)
3001
3002                                if annotation_file_found:
3003
3004                                    database = Database(database=annotation_file_found)
3005                                    quick_annotation_format = database.get_format()
3006                                    quick_annotation_is_compressed = (
3007                                        database.is_compressed()
3008                                    )
3009                                    quick_annotation_is_indexed = os.path.exists(
3010                                        f"{annotation_file_found}.tbi"
3011                                    )
3012                                    bcftools_preference = False
3013
3014                                    # Check Annotation Tool
3015                                    if not annotation_tool:
3016                                        if (
3017                                            bcftools_preference
3018                                            and quick_annotation_format
3019                                            in ["vcf", "bed"]
3020                                            and quick_annotation_is_compressed
3021                                            and quick_annotation_is_indexed
3022                                        ):
3023                                            annotation_tool = "bcftools"
3024                                        elif quick_annotation_format in [
3025                                            "vcf",
3026                                            "bed",
3027                                            "tsv",
3028                                            "tsv",
3029                                            "csv",
3030                                            "json",
3031                                            "tbl",
3032                                            "parquet",
3033                                            "duckdb",
3034                                        ]:
3035                                            annotation_tool = "parquet"
3036                                        else:
3037                                            log.error(
3038                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3039                                            )
3040                                            raise ValueError(
3041                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3042                                            )
3043
3044                                    log.debug(
3045                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
3046                                    )
3047
3048                                    # Annotation Tool dispatch
3049                                    if annotation_tool:
3050                                        if annotation_tool not in param["annotation"]:
3051                                            param["annotation"][annotation_tool] = {}
3052                                        if (
3053                                            "annotations"
3054                                            not in param["annotation"][annotation_tool]
3055                                        ):
3056                                            param["annotation"][annotation_tool][
3057                                                "annotations"
3058                                            ] = {}
3059                                        param["annotation"][annotation_tool][
3060                                            "annotations"
3061                                        ][annotation_file_found] = annotations
3062
3063                                else:
3064                                    log.error(
3065                                        f"Quick Annotation File {annotation_file} does NOT exist"
3066                                    )
3067
3068                self.set_param(param)
3069
3070        if param.get("annotation", None):
3071            log.info("Annotations")
3072            if param.get("annotation", {}).get("parquet", None):
3073                log.info("Annotations 'parquet'...")
3074                self.annotation_parquet()
3075            if param.get("annotation", {}).get("bcftools", None):
3076                log.info("Annotations 'bcftools'...")
3077                self.annotation_bcftools()
3078            if param.get("annotation", {}).get("snpsift", None):
3079                log.info("Annotations 'snpsift'...")
3080                self.annotation_snpsift()
3081            if param.get("annotation", {}).get("annovar", None):
3082                log.info("Annotations 'annovar'...")
3083                self.annotation_annovar()
3084            if param.get("annotation", {}).get("snpeff", None):
3085                log.info("Annotations 'snpeff'...")
3086                self.annotation_snpeff()
3087            if param.get("annotation", {}).get("exomiser", None) is not None:
3088                log.info("Annotations 'exomiser'...")
3089                self.annotation_exomiser()
3090            if param.get("annotation", {}).get("splice", None) is not None:
3091                log.info("Annotations 'splice' ...")
3092                self.annotation_splice()
3093
3094        # Explode INFOS fields into table fields
3095        if self.get_explode_infos():
3096            self.explode_infos(
3097                prefix=self.get_explode_infos_prefix(),
3098                fields=self.get_explode_infos_fields(),
3099                force=True,
3100            )

It annotates the VCF file with the annotations specified in the config file.

def annotation_snpsift(self, threads: int = None) -> None:
3102    def annotation_snpsift(self, threads: int = None) -> None:
3103        """
3104        This function annotate with bcftools
3105
3106        :param threads: Number of threads to use
3107        :return: the value of the variable "return_value".
3108        """
3109
3110        # DEBUG
3111        log.debug("Start annotation with bcftools databases")
3112
3113        # Threads
3114        if not threads:
3115            threads = self.get_threads()
3116        log.debug("Threads: " + str(threads))
3117
3118        # Config
3119        config = self.get_config()
3120        log.debug("Config: " + str(config))
3121
3122        # Config - snpSift
3123        snpsift_bin_command = get_bin_command(
3124            bin="SnpSift.jar",
3125            tool="snpsift",
3126            bin_type="jar",
3127            config=config,
3128            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3129        )
3130        if not snpsift_bin_command:
3131            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3132            log.error(msg_err)
3133            raise ValueError(msg_err)
3134
3135        # Config - bcftools
3136        bcftools_bin_command = get_bin_command(
3137            bin="bcftools",
3138            tool="bcftools",
3139            bin_type="bin",
3140            config=config,
3141            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3142        )
3143        if not bcftools_bin_command:
3144            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3145            log.error(msg_err)
3146            raise ValueError(msg_err)
3147
3148        # Config - BCFTools databases folders
3149        databases_folders = set(
3150            self.get_config()
3151            .get("folders", {})
3152            .get("databases", {})
3153            .get("annotations", ["."])
3154            + self.get_config()
3155            .get("folders", {})
3156            .get("databases", {})
3157            .get("bcftools", ["."])
3158        )
3159        log.debug("Databases annotations: " + str(databases_folders))
3160
3161        # Param
3162        annotations = (
3163            self.get_param()
3164            .get("annotation", {})
3165            .get("snpsift", {})
3166            .get("annotations", None)
3167        )
3168        log.debug("Annotations: " + str(annotations))
3169
3170        # Assembly
3171        assembly = self.get_param().get(
3172            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3173        )
3174
3175        # Data
3176        table_variants = self.get_table_variants()
3177
3178        # Check if not empty
3179        log.debug("Check if not empty")
3180        sql_query_chromosomes = (
3181            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3182        )
3183        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3184        if not sql_query_chromosomes_df["count"][0]:
3185            log.info(f"VCF empty")
3186            return
3187
3188        # VCF header
3189        vcf_reader = self.get_header()
3190        log.debug("Initial header: " + str(vcf_reader.infos))
3191
3192        # Existing annotations
3193        for vcf_annotation in self.get_header().infos:
3194
3195            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3196            log.debug(
3197                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3198            )
3199
3200        if annotations:
3201
3202            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3203
3204                # Export VCF file
3205                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3206
3207                # Init
3208                commands = {}
3209
3210                for annotation in annotations:
3211                    annotation_fields = annotations[annotation]
3212
3213                    # Annotation Name
3214                    annotation_name = os.path.basename(annotation)
3215
3216                    if not annotation_fields:
3217                        annotation_fields = {"INFO": None}
3218
3219                    log.debug(f"Annotation '{annotation_name}'")
3220                    log.debug(
3221                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3222                    )
3223
3224                    # Create Database
3225                    database = Database(
3226                        database=annotation,
3227                        databases_folders=databases_folders,
3228                        assembly=assembly,
3229                    )
3230
3231                    # Find files
3232                    db_file = database.get_database()
3233                    db_file = full_path(db_file)
3234                    db_hdr_file = database.get_header_file()
3235                    db_hdr_file = full_path(db_hdr_file)
3236                    db_file_type = database.get_format()
3237                    db_tbi_file = f"{db_file}.tbi"
3238                    db_file_compressed = database.is_compressed()
3239
3240                    # Check if compressed
3241                    if not db_file_compressed:
3242                        log.error(
3243                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3244                        )
3245                        raise ValueError(
3246                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3247                        )
3248
3249                    # Check if indexed
3250                    if not os.path.exists(db_tbi_file):
3251                        log.error(
3252                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3253                        )
3254                        raise ValueError(
3255                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3256                        )
3257
3258                    # Check index - try to create if not exists
3259                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3260                        log.error("Annotation failed: database not valid")
3261                        log.error(f"Annotation annotation file: {db_file}")
3262                        log.error(f"Annotation annotation header: {db_hdr_file}")
3263                        log.error(f"Annotation annotation index: {db_tbi_file}")
3264                        raise ValueError(
3265                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3266                        )
3267                    else:
3268
3269                        log.debug(
3270                            f"Annotation '{annotation}' - file: "
3271                            + str(db_file)
3272                            + " and "
3273                            + str(db_hdr_file)
3274                        )
3275
3276                        # Load header as VCF object
3277                        db_hdr_vcf = Variants(input=db_hdr_file)
3278                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3279                        log.debug(
3280                            "Annotation database header: "
3281                            + str(db_hdr_vcf_header_infos)
3282                        )
3283
3284                        # For all fields in database
3285                        annotation_fields_full = False
3286                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3287                            annotation_fields = {
3288                                key: key for key in db_hdr_vcf_header_infos
3289                            }
3290                            log.debug(
3291                                "Annotation database header - All annotations added: "
3292                                + str(annotation_fields)
3293                            )
3294                            annotation_fields_full = True
3295
3296                        # # Create file for field rename
3297                        # log.debug("Create file for field rename")
3298                        # tmp_rename = NamedTemporaryFile(
3299                        #     prefix=self.get_prefix(),
3300                        #     dir=self.get_tmp_dir(),
3301                        #     suffix=".rename",
3302                        #     delete=False,
3303                        # )
3304                        # tmp_rename_name = tmp_rename.name
3305                        # tmp_files.append(tmp_rename_name)
3306
3307                        # Number of fields
3308                        nb_annotation_field = 0
3309                        annotation_list = []
3310                        annotation_infos_rename_list = []
3311
3312                        for annotation_field in annotation_fields:
3313
3314                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3315                            annotation_fields_new_name = annotation_fields.get(
3316                                annotation_field, annotation_field
3317                            )
3318                            if not annotation_fields_new_name:
3319                                annotation_fields_new_name = annotation_field
3320
3321                            # Check if field is in DB and if field is not elready in input data
3322                            if (
3323                                annotation_field in db_hdr_vcf.get_header().infos
3324                                and annotation_fields_new_name
3325                                not in self.get_header().infos
3326                            ):
3327
3328                                log.info(
3329                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3330                                )
3331
3332                                # BCFTools annotate param to rename fields
3333                                if annotation_field != annotation_fields_new_name:
3334                                    annotation_infos_rename_list.append(
3335                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3336                                    )
3337
3338                                # Add INFO field to header
3339                                db_hdr_vcf_header_infos_number = (
3340                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3341                                )
3342                                db_hdr_vcf_header_infos_type = (
3343                                    db_hdr_vcf_header_infos[annotation_field].type
3344                                    or "String"
3345                                )
3346                                db_hdr_vcf_header_infos_description = (
3347                                    db_hdr_vcf_header_infos[annotation_field].desc
3348                                    or f"{annotation_field} description"
3349                                )
3350                                db_hdr_vcf_header_infos_source = (
3351                                    db_hdr_vcf_header_infos[annotation_field].source
3352                                    or "unknown"
3353                                )
3354                                db_hdr_vcf_header_infos_version = (
3355                                    db_hdr_vcf_header_infos[annotation_field].version
3356                                    or "unknown"
3357                                )
3358
3359                                vcf_reader.infos[annotation_fields_new_name] = (
3360                                    vcf.parser._Info(
3361                                        annotation_fields_new_name,
3362                                        db_hdr_vcf_header_infos_number,
3363                                        db_hdr_vcf_header_infos_type,
3364                                        db_hdr_vcf_header_infos_description,
3365                                        db_hdr_vcf_header_infos_source,
3366                                        db_hdr_vcf_header_infos_version,
3367                                        self.code_type_map[
3368                                            db_hdr_vcf_header_infos_type
3369                                        ],
3370                                    )
3371                                )
3372
3373                                annotation_list.append(annotation_field)
3374
3375                                nb_annotation_field += 1
3376
3377                            else:
3378
3379                                if (
3380                                    annotation_field
3381                                    not in db_hdr_vcf.get_header().infos
3382                                ):
3383                                    log.warning(
3384                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3385                                    )
3386                                if (
3387                                    annotation_fields_new_name
3388                                    in self.get_header().infos
3389                                ):
3390                                    log.warning(
3391                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3392                                    )
3393
3394                        log.info(
3395                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3396                        )
3397
3398                        annotation_infos = ",".join(annotation_list)
3399
3400                        if annotation_infos != "":
3401
3402                            # Annotated VCF (and error file)
3403                            tmp_annotation_vcf_name = os.path.join(
3404                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3405                            )
3406                            tmp_annotation_vcf_name_err = (
3407                                tmp_annotation_vcf_name + ".err"
3408                            )
3409
3410                            # Add fields to annotate
3411                            if not annotation_fields_full:
3412                                annotation_infos_option = f"-info {annotation_infos}"
3413                            else:
3414                                annotation_infos_option = ""
3415
3416                            # Info fields rename
3417                            if annotation_infos_rename_list:
3418                                annotation_infos_rename = " -c " + ",".join(
3419                                    annotation_infos_rename_list
3420                                )
3421                            else:
3422                                annotation_infos_rename = ""
3423
3424                            # Annotate command
3425                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3426
3427                            # Add command
3428                            commands[command_annotate] = tmp_annotation_vcf_name
3429
3430                if commands:
3431
3432                    # Export VCF file
3433                    self.export_variant_vcf(
3434                        vcf_file=tmp_vcf_name,
3435                        remove_info=True,
3436                        add_samples=False,
3437                        index=True,
3438                    )
3439                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3440
3441                    # Num command
3442                    nb_command = 0
3443
3444                    # Annotate
3445                    for command_annotate in commands:
3446                        nb_command += 1
3447                        log.info(
3448                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3449                        )
3450                        log.debug(f"command_annotate={command_annotate}")
3451                        run_parallel_commands([command_annotate], threads)
3452
3453                        # Debug
3454                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3455
3456                        # Update variants
3457                        log.info(
3458                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3459                        )
3460                        self.update_from_vcf(commands[command_annotate])

This function annotates with bcftools.

Parameters
  • threads: Number of threads to use
Returns

None.

def annotation_bcftools(self, threads: int = None) -> None:
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table using `bcftools annotate`.

        Databases are taken from param section "annotation" -> "bcftools" ->
        "annotations", and resolved through a Database object using the config
        folders "folders" -> "databases" -> "annotations" and "bcftools".
        The variants are exported to a temporary bgzipped VCF, then one
        `bcftools annotate` command is generated per database and per
        chromosome (restricted by a BED file of merged variant regions).
        Commands are run in parallel, the annotated files are combined with
        `bcftools merge`, stderr files are scanned for warnings/errors, and
        the variants table is finally updated from the merged VCF.

        :param threads: Number of threads to use (defaults to self.get_threads())
        :return: None
        :raises ValueError: if no bcftools binary is available, if a database
            file is not compressed, not indexed (.tbi missing) or not valid
            (file/header missing), or if any annotate/merge command wrote an
            error line ("[E::") to its stderr file
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads (fall back to the object's configured thread count)
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Keep temporary files/folders when verbosity is "debug"
        # NOTE(review): delete_tmp is assigned but never read later in this
        # function (tmp files are removed via the "rm -f" appended to the
        # merge command) — confirm whether it is intentionally unused here
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        # (union of the generic "annotations" folders and the bcftools-specific ones)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - databases to annotate with, as {database: {field: new_name}}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param overrides config, config overrides default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Temporary bgzipped VCF that will hold the exported variants
        # (export itself happens later, only if there are commands to run)
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header object - new INFO fields are registered on it below
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Log annotations already present in the input VCF
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No fields requested means "take all INFO fields"
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database object to locate the file, header and format
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools annotate needs compressed input)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (.tbi required)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check database file and header file exist
                # NOTE(review): nothing is created here despite the original
                # comment saying "try to create if not exists" - it only raises
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load the database header as a Variants object to read its INFO fields
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # "ALL"/"INFO" keyword expands to every field of the database header
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Optional new name for the field; falls back to the
                        # original field name when None/empty.
                        # NOTE(review): the original TODO claimed rename was
                        # "not managed", but the NEW:=INFO/OLD mapping emitted
                        # below appears to handle it - confirm
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not already in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Register the INFO field on the output header,
                            # defaulting missing num/type/desc/source/version
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # Renamed fields use the bcftools "NEW:=INFO/OLD"
                            # -c syntax; unchanged fields are passed as-is
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Keep only "##" meta lines from the database header
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes (one annotate command per chromosome)
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED databases need positional columns prepended to -c
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: pad each variant position by
                            # +/- 1Mb and merge the overlapping intervals
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT  \"#CHROM\",
                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                        \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp output file (plus its stderr capture file)
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # --pair-logic exact: presumably requires exact
                            # REF/ALT match (confirm against bcftools docs);
                            # -Oz1: bgzipped output, compression level 1;
                            # output is tabix-indexed for the later merge
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export the variants to the temporary VCF (INFO stripped,
                # no samples, indexed) - input of every annotate command
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Split the thread budget across the parallel annotate commands
                # NOTE(review): the "else" branch is unreachable - this code is
                # already inside "if commands:"
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                # round() can yield 0 when there are more commands than threads
                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge all per-database/per-chromosome annotated files
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp files are removed as part of the merge command itself
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge (--force-samples: tolerate duplicate sample
                    # names across the merged annotated files)
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Scan stderr files: "[W::" lines are warnings, "[E::"
                    # lines are errors and make the whole annotation fail
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # failed
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)

This function annotates with Exomiser.

Parameters
  • threads: Number of threads to use
Returns

None.

def annotation_exomiser(self, threads: int = None) -> None:
3943    def annotation_exomiser(self, threads: int = None) -> None:
3944        """
3945        This function annotate with Exomiser
3946
3947        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
3948        - "analysis" (dict/file):
3949            Full analysis dictionnary parameters (see Exomiser docs).
3950            Either a dict, or a file in JSON or YAML format.
3951            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
3952            Default : None
3953        - "preset" (string):
3954            Analysis preset (available in config folder).
3955            Used if no full "analysis" is provided.
3956            Default: "exome"
3957        - "phenopacket" (dict/file):
3958            Samples and phenotipic features parameters (see Exomiser docs).
3959            Either a dict, or a file in JSON or YAML format.
3960            Default: None
3961        - "subject" (dict):
3962            Sample parameters (see Exomiser docs).
3963            Example:
3964                "subject":
3965                    {
3966                        "id": "ISDBM322017",
3967                        "sex": "FEMALE"
3968                    }
3969            Default: None
3970        - "sample" (string):
3971            Sample name to construct "subject" section:
3972                "subject":
3973                    {
3974                        "id": "<sample>",
3975                        "sex": "UNKNOWN_SEX"
3976                    }
3977            Default: None
3978        - "phenotypicFeatures" (dict)
3979            Phenotypic features to construct "subject" section.
3980            Example:
3981                "phenotypicFeatures":
3982                    [
3983                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
3984                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
3985                    ]
3986        - "hpo" (list)
3987            List of HPO ids as phenotypic features.
3988            Example:
3989                "hpo": ['0001156', '0001363', '0011304', '0010055']
3990            Default: []
3991        - "outputOptions" (dict):
3992            Output options (see Exomiser docs).
3993            Default:
3994                "output_options" =
3995                    {
3996                        "outputContributingVariantsOnly": False,
3997                        "numGenes": 0,
3998                        "outputFormats": ["TSV_VARIANT", "VCF"]
3999                    }
4000        - "transcript_source" (string):
4001            Transcript source (either "refseq", "ucsc", "ensembl")
4002            Default: "refseq"
4003        - "exomiser_to_info" (boolean):
4004            Add exomiser TSV file columns as INFO fields in VCF.
4005            Default: False
4006        - "release" (string):
4007            Exomise database release.
4008            If not exists, database release will be downloaded (take a while).
4009            Default: None (provided by application.properties configuration file)
4010        - "exomiser_application_properties" (file):
4011            Exomiser configuration file (see Exomiser docs).
4012            Useful to automatically download databases (especially for specific genome databases).
4013
4014        Notes:
4015        - If no sample in parameters, first sample in VCF will be chosen
4016        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
4017
4018        :param threads: The number of threads to use
4019        :return: None.
4020        """
4021
4022        # DEBUG
4023        log.debug("Start annotation with Exomiser databases")
4024
4025        # Threads
4026        if not threads:
4027            threads = self.get_threads()
4028        log.debug("Threads: " + str(threads))
4029
4030        # Config
4031        config = self.get_config()
4032        log.debug("Config: " + str(config))
4033
4034        # Config - Folders - Databases
4035        databases_folders = (
4036            config.get("folders", {})
4037            .get("databases", {})
4038            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
4039        )
4040        databases_folders = full_path(databases_folders)
4041        if not os.path.exists(databases_folders):
4042            log.error(f"Databases annotations: {databases_folders} NOT found")
4043        log.debug("Databases annotations: " + str(databases_folders))
4044
4045        # Config - Exomiser
4046        exomiser_bin_command = get_bin_command(
4047            bin="exomiser-cli*.jar",
4048            tool="exomiser",
4049            bin_type="jar",
4050            config=config,
4051            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
4052        )
4053        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
4054        if not exomiser_bin_command:
4055            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
4056            log.error(msg_err)
4057            raise ValueError(msg_err)
4058
4059        # Param
4060        param = self.get_param()
4061        log.debug("Param: " + str(param))
4062
4063        # Param - Exomiser
4064        param_exomiser = param.get("annotation", {}).get("exomiser", {})
4065        log.debug(f"Param Exomiser: {param_exomiser}")
4066
4067        # Param - Assembly
4068        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4069        log.debug("Assembly: " + str(assembly))
4070
4071        # Data
4072        table_variants = self.get_table_variants()
4073
4074        # Check if not empty
4075        log.debug("Check if not empty")
4076        sql_query_chromosomes = (
4077            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4078        )
4079        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4080            log.info(f"VCF empty")
4081            return False
4082
4083        # VCF header
4084        vcf_reader = self.get_header()
4085        log.debug("Initial header: " + str(vcf_reader.infos))
4086
4087        # Samples
4088        samples = self.get_header_sample_list()
4089        if not samples:
4090            log.error("No Samples in VCF")
4091            return False
4092        log.debug(f"Samples: {samples}")
4093
4094        # Memory limit
4095        memory_limit = self.get_memory("8G")
4096        log.debug(f"memory_limit: {memory_limit}")
4097
4098        # Exomiser java options
4099        exomiser_java_options = (
4100            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4101        )
4102        log.debug(f"Exomiser java options: {exomiser_java_options}")
4103
4104        # Download Exomiser (if not exists)
4105        exomiser_release = param_exomiser.get("release", None)
4106        exomiser_application_properties = param_exomiser.get(
4107            "exomiser_application_properties", None
4108        )
4109        databases_download_exomiser(
4110            assemblies=[assembly],
4111            exomiser_folder=databases_folders,
4112            exomiser_release=exomiser_release,
4113            exomiser_phenotype_release=exomiser_release,
4114            exomiser_application_properties=exomiser_application_properties,
4115        )
4116
4117        # Force annotation
4118        force_update_annotation = True
4119
4120        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4121            log.debug("Start annotation Exomiser")
4122
4123            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4124
4125                # tmp_dir = "/tmp/exomiser"
4126
4127                ### ANALYSIS ###
4128                ################
4129
4130                # Create analysis.json through analysis dict
4131                # either analysis in param or by default
4132                # depending on preset exome/genome)
4133
4134                # Init analysis dict
4135                param_exomiser_analysis_dict = {}
4136
4137                # analysis from param
4138                param_exomiser_analysis = param_exomiser.get("analysis", {})
4139                param_exomiser_analysis = full_path(param_exomiser_analysis)
4140
4141                # If analysis in param -> load anlaysis json
4142                if param_exomiser_analysis:
4143
4144                    # If param analysis is a file and exists
4145                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4146                        param_exomiser_analysis
4147                    ):
4148                        # Load analysis file into analysis dict (either yaml or json)
4149                        with open(param_exomiser_analysis) as json_file:
4150                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4151
4152                    # If param analysis is a dict
4153                    elif isinstance(param_exomiser_analysis, dict):
4154                        # Load analysis dict into analysis dict (either yaml or json)
4155                        param_exomiser_analysis_dict = param_exomiser_analysis
4156
4157                    # Error analysis type
4158                    else:
4159                        log.error(f"Analysis type unknown. Check param file.")
4160                        raise ValueError(f"Analysis type unknown. Check param file.")
4161
4162                # Case no input analysis config file/dict
4163                # Use preset (exome/genome) to open default config file
4164                if not param_exomiser_analysis_dict:
4165
4166                    # default preset
4167                    default_preset = "exome"
4168
4169                    # Get param preset or default preset
4170                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4171
4172                    # Try to find if preset is a file
4173                    if os.path.exists(param_exomiser_preset):
4174                        # Preset file is provided in full path
4175                        param_exomiser_analysis_default_config_file = (
4176                            param_exomiser_preset
4177                        )
4178                    # elif os.path.exists(full_path(param_exomiser_preset)):
4179                    #     # Preset file is provided in full path
4180                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4181                    elif os.path.exists(
4182                        os.path.join(folder_config, param_exomiser_preset)
4183                    ):
4184                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4185                        param_exomiser_analysis_default_config_file = os.path.join(
4186                            folder_config, param_exomiser_preset
4187                        )
4188                    else:
4189                        # Construct preset file
4190                        param_exomiser_analysis_default_config_file = os.path.join(
4191                            folder_config,
4192                            f"preset-{param_exomiser_preset}-analysis.json",
4193                        )
4194
4195                    # If preset file exists
4196                    param_exomiser_analysis_default_config_file = full_path(
4197                        param_exomiser_analysis_default_config_file
4198                    )
4199                    if os.path.exists(param_exomiser_analysis_default_config_file):
4200                        # Load prest file into analysis dict (either yaml or json)
4201                        with open(
4202                            param_exomiser_analysis_default_config_file
4203                        ) as json_file:
4204                            # param_exomiser_analysis_dict[""] = json.load(json_file)
4205                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4206                                json_file
4207                            )
4208
4209                    # Error preset file
4210                    else:
4211                        log.error(
4212                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4213                        )
4214                        raise ValueError(
4215                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4216                        )
4217
4218                # If no analysis dict created
4219                if not param_exomiser_analysis_dict:
4220                    log.error(f"No analysis config")
4221                    raise ValueError(f"No analysis config")
4222
4223                # Log
4224                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4225
4226                ### PHENOPACKET ###
4227                ###################
4228
4229                # If no PhenoPacket in analysis dict -> check in param
4230                if "phenopacket" not in param_exomiser_analysis_dict:
4231
4232                    # If PhenoPacket in param -> load anlaysis json
4233                    if param_exomiser.get("phenopacket", None):
4234
4235                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4236                        param_exomiser_phenopacket = full_path(
4237                            param_exomiser_phenopacket
4238                        )
4239
4240                        # If param phenopacket is a file and exists
4241                        if isinstance(
4242                            param_exomiser_phenopacket, str
4243                        ) and os.path.exists(param_exomiser_phenopacket):
4244                            # Load phenopacket file into analysis dict (either yaml or json)
4245                            with open(param_exomiser_phenopacket) as json_file:
4246                                param_exomiser_analysis_dict["phenopacket"] = (
4247                                    yaml.safe_load(json_file)
4248                                )
4249
4250                        # If param phenopacket is a dict
4251                        elif isinstance(param_exomiser_phenopacket, dict):
4252                            # Load phenopacket dict into analysis dict (either yaml or json)
4253                            param_exomiser_analysis_dict["phenopacket"] = (
4254                                param_exomiser_phenopacket
4255                            )
4256
4257                        # Error phenopacket type
4258                        else:
4259                            log.error(f"Phenopacket type unknown. Check param file.")
4260                            raise ValueError(
4261                                f"Phenopacket type unknown. Check param file."
4262                            )
4263
4264                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4265                if "phenopacket" not in param_exomiser_analysis_dict:
4266
4267                    # Init PhenoPacket
4268                    param_exomiser_analysis_dict["phenopacket"] = {
4269                        "id": "analysis",
4270                        "proband": {},
4271                    }
4272
4273                    ### Add subject ###
4274
4275                    # If subject exists
4276                    param_exomiser_subject = param_exomiser.get("subject", {})
4277
4278                    # If subject not exists -> found sample ID
4279                    if not param_exomiser_subject:
4280
4281                        # Found sample ID in param
4282                        sample = param_exomiser.get("sample", None)
4283
4284                        # Find sample ID (first sample)
4285                        if not sample:
4286                            sample_list = self.get_header_sample_list()
4287                            if len(sample_list) > 0:
4288                                sample = sample_list[0]
4289                            else:
4290                                log.error(f"No sample found")
4291                                raise ValueError(f"No sample found")
4292
4293                        # Create subject
4294                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4295
4296                    # Add to dict
4297                    param_exomiser_analysis_dict["phenopacket"][
4298                        "subject"
4299                    ] = param_exomiser_subject
4300
4301                    ### Add "phenotypicFeatures" ###
4302
4303                    # If phenotypicFeatures exists
4304                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4305                        "phenotypicFeatures", []
4306                    )
4307
4308                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4309                    if not param_exomiser_phenotypicfeatures:
4310
4311                        # Found HPO in param
4312                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4313
4314                        # Split HPO if list in string format separated by comma
4315                        if isinstance(param_exomiser_hpo, str):
4316                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4317
4318                        # Create HPO list
4319                        for hpo in param_exomiser_hpo:
4320                            hpo_clean = re.sub("[^0-9]", "", hpo)
4321                            param_exomiser_phenotypicfeatures.append(
4322                                {
4323                                    "type": {
4324                                        "id": f"HP:{hpo_clean}",
4325                                        "label": f"HP:{hpo_clean}",
4326                                    }
4327                                }
4328                            )
4329
4330                    # Add to dict
4331                    param_exomiser_analysis_dict["phenopacket"][
4332                        "phenotypicFeatures"
4333                    ] = param_exomiser_phenotypicfeatures
4334
4335                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4336                    if not param_exomiser_phenotypicfeatures:
4337                        for step in param_exomiser_analysis_dict.get(
4338                            "analysis", {}
4339                        ).get("steps", []):
4340                            if "hiPhivePrioritiser" in step:
4341                                param_exomiser_analysis_dict.get("analysis", {}).get(
4342                                    "steps", []
4343                                ).remove(step)
4344
4345                ### Add Input File ###
4346
4347                # Initial file name and htsFiles
4348                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4349                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4350                    {
4351                        "uri": tmp_vcf_name,
4352                        "htsFormat": "VCF",
4353                        "genomeAssembly": assembly,
4354                    }
4355                ]
4356
4357                ### Add metaData ###
4358
4359                # If metaData not in analysis dict
4360                if "metaData" not in param_exomiser_analysis_dict:
4361                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4362                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4363                        "createdBy": "howard",
4364                        "phenopacketSchemaVersion": 1,
4365                    }
4366
4367                ### OutputOptions ###
4368
4369                # Init output result folder
4370                output_results = os.path.join(tmp_dir, "results")
4371
4372                # If no outputOptions in analysis dict
4373                if "outputOptions" not in param_exomiser_analysis_dict:
4374
4375                    # default output formats
4376                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4377
4378                    # Get outputOptions in param
4379                    output_options = param_exomiser.get("outputOptions", None)
4380
4381                    # If no output_options in param -> check
4382                    if not output_options:
4383                        output_options = {
4384                            "outputContributingVariantsOnly": False,
4385                            "numGenes": 0,
4386                            "outputFormats": defaut_output_formats,
4387                        }
4388
4389                    # Replace outputDirectory in output options
4390                    output_options["outputDirectory"] = output_results
4391                    output_options["outputFileName"] = "howard"
4392
4393                    # Add outputOptions in analysis dict
4394                    param_exomiser_analysis_dict["outputOptions"] = output_options
4395
4396                else:
4397
4398                    # Replace output_results and output format (if exists in param)
4399                    param_exomiser_analysis_dict["outputOptions"][
4400                        "outputDirectory"
4401                    ] = output_results
4402                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4403                        list(
4404                            set(
4405                                param_exomiser_analysis_dict.get(
4406                                    "outputOptions", {}
4407                                ).get("outputFormats", [])
4408                                + ["TSV_VARIANT", "VCF"]
4409                            )
4410                        )
4411                    )
4412
4413                # log
4414                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4415
4416                ### ANALYSIS FILE ###
4417                #####################
4418
4419                ### Full JSON analysis config file ###
4420
4421                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4422                with open(exomiser_analysis, "w") as fp:
4423                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4424
4425                ### SPLIT analysis and sample config files
4426
4427                # Splitted analysis dict
4428                param_exomiser_analysis_dict_for_split = (
4429                    param_exomiser_analysis_dict.copy()
4430                )
4431
4432                # Phenopacket JSON file
4433                exomiser_analysis_phenopacket = os.path.join(
4434                    tmp_dir, "analysis_phenopacket.json"
4435                )
4436                with open(exomiser_analysis_phenopacket, "w") as fp:
4437                    json.dump(
4438                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4439                        fp,
4440                        indent=4,
4441                    )
4442
4443                # Analysis JSON file without Phenopacket parameters
4444                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4445                exomiser_analysis_analysis = os.path.join(
4446                    tmp_dir, "analysis_analysis.json"
4447                )
4448                with open(exomiser_analysis_analysis, "w") as fp:
4449                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4450
4451                ### INITAL VCF file ###
4452                #######################
4453
4454                ### Create list of samples to use and include inti initial VCF file ####
4455
4456                # Subject (main sample)
4457                # Get sample ID in analysis dict
4458                sample_subject = (
4459                    param_exomiser_analysis_dict.get("phenopacket", {})
4460                    .get("subject", {})
4461                    .get("id", None)
4462                )
4463                sample_proband = (
4464                    param_exomiser_analysis_dict.get("phenopacket", {})
4465                    .get("proband", {})
4466                    .get("subject", {})
4467                    .get("id", None)
4468                )
4469                sample = []
4470                if sample_subject:
4471                    sample.append(sample_subject)
4472                if sample_proband:
4473                    sample.append(sample_proband)
4474
4475                # Get sample ID within Pedigree
4476                pedigree_persons_list = (
4477                    param_exomiser_analysis_dict.get("phenopacket", {})
4478                    .get("pedigree", {})
4479                    .get("persons", {})
4480                )
4481
4482                # Create list with all sample ID in pedigree (if exists)
4483                pedigree_persons = []
4484                for person in pedigree_persons_list:
4485                    pedigree_persons.append(person.get("individualId"))
4486
4487                # Concat subject sample ID and samples ID in pedigreesamples
4488                samples = list(set(sample + pedigree_persons))
4489
4490                # Check if sample list is not empty
4491                if not samples:
4492                    log.error(f"No samples found")
4493                    raise ValueError(f"No samples found")
4494
4495                # Create VCF with sample (either sample in param or first one by default)
4496                # Export VCF file
4497                self.export_variant_vcf(
4498                    vcf_file=tmp_vcf_name,
4499                    remove_info=True,
4500                    add_samples=True,
4501                    list_samples=samples,
4502                    index=False,
4503                )
4504
4505                ### Execute Exomiser ###
4506                ########################
4507
4508                # Init command
4509                exomiser_command = ""
4510
4511                # Command exomiser options
4512                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
4513
4514                # Release
4515                exomiser_release = param_exomiser.get("release", None)
4516                if exomiser_release:
4517                    # phenotype data version
4518                    exomiser_options += (
4519                        f" --exomiser.phenotype.data-version={exomiser_release} "
4520                    )
4521                    # data version
4522                    exomiser_options += (
4523                        f" --exomiser.{assembly}.data-version={exomiser_release} "
4524                    )
4525                    # variant white list
4526                    variant_white_list_file = (
4527                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
4528                    )
4529                    if os.path.exists(
4530                        os.path.join(
4531                            databases_folders, assembly, variant_white_list_file
4532                        )
4533                    ):
4534                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
4535
4536                # transcript_source
4537                transcript_source = param_exomiser.get(
4538                    "transcript_source", None
4539                )  # ucsc, refseq, ensembl
4540                if transcript_source:
4541                    exomiser_options += (
4542                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
4543                    )
4544
4545                # If analysis contain proband param
4546                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
4547                    "proband", {}
4548                ):
4549                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
4550
4551                # If no proband (usually uniq sample)
4552                else:
4553                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
4554
4555                # Log
4556                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
4557
4558                # Run command
4559                result = subprocess.call(
4560                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
4561                )
4562                if result:
4563                    log.error("Exomiser command failed")
4564                    raise ValueError("Exomiser command failed")
4565
4566                ### RESULTS ###
4567                ###############
4568
4569                ### Annotate with TSV fields ###
4570
4571                # Init result tsv file
4572                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
4573
4574                # Init result tsv file
4575                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
4576
4577                # Parse TSV file and explode columns in INFO field
4578                if exomiser_to_info and os.path.exists(output_results_tsv):
4579
4580                    # Log
4581                    log.debug("Exomiser columns to VCF INFO field")
4582
4583                    # Retrieve columns and types
4584                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
4585                    output_results_tsv_df = self.get_query_to_df(query)
4586                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
4587
4588                    # Init concat fields for update
4589                    sql_query_update_concat_fields = []
4590
4591                    # Fields to avoid
4592                    fields_to_avoid = [
4593                        "CONTIG",
4594                        "START",
4595                        "END",
4596                        "REF",
4597                        "ALT",
4598                        "QUAL",
4599                        "FILTER",
4600                        "GENOTYPE",
4601                    ]
4602
4603                    # List all columns to add into header
4604                    for header_column in output_results_tsv_columns:
4605
4606                        # If header column is enable
4607                        if header_column not in fields_to_avoid:
4608
4609                            # Header info type
4610                            header_info_type = "String"
4611                            header_column_df = output_results_tsv_df[header_column]
4612                            header_column_df_dtype = header_column_df.dtype
4613                            if header_column_df_dtype == object:
4614                                if (
4615                                    pd.to_numeric(header_column_df, errors="coerce")
4616                                    .notnull()
4617                                    .all()
4618                                ):
4619                                    header_info_type = "Float"
4620                            else:
4621                                header_info_type = "Integer"
4622
4623                            # Header info
4624                            characters_to_validate = ["-"]
4625                            pattern = "[" + "".join(characters_to_validate) + "]"
4626                            header_info_name = re.sub(
4627                                pattern,
4628                                "_",
4629                                f"Exomiser_{header_column}".replace("#", ""),
4630                            )
4631                            header_info_number = "."
4632                            header_info_description = (
4633                                f"Exomiser {header_column} annotation"
4634                            )
4635                            header_info_source = "Exomiser"
4636                            header_info_version = "unknown"
4637                            header_info_code = CODE_TYPE_MAP[header_info_type]
4638                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
4639                                header_info_name,
4640                                header_info_number,
4641                                header_info_type,
4642                                header_info_description,
4643                                header_info_source,
4644                                header_info_version,
4645                                header_info_code,
4646                            )
4647
4648                            # Add field to add for update to concat fields
4649                            sql_query_update_concat_fields.append(
4650                                f"""
4651                                CASE
4652                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
4653                                    THEN concat(
4654                                        '{header_info_name}=',
4655                                        table_parquet."{header_column}",
4656                                        ';'
4657                                        )
4658
4659                                    ELSE ''
4660                                END
4661                            """
4662                            )
4663
4664                    # Update query
4665                    sql_query_update = f"""
4666                        UPDATE {table_variants} as table_variants
4667                            SET INFO = concat(
4668                                            CASE
4669                                                WHEN INFO NOT IN ('', '.')
4670                                                THEN INFO
4671                                                ELSE ''
4672                                            END,
4673                                            CASE
4674                                                WHEN table_variants.INFO NOT IN ('','.')
4675                                                THEN ';'
4676                                                ELSE ''
4677                                            END,
4678                                            (
4679                                            SELECT 
4680                                                concat(
4681                                                    {",".join(sql_query_update_concat_fields)}
4682                                                )
4683                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
4684                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
4685                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
4686                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
4687                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
4688                                            )
4689                                        )
4690                            ;
4691                        """
4692
4693                    # Update
4694                    self.conn.execute(sql_query_update)
4695
4696                ### Annotate with VCF INFO field ###
4697
4698                # Init result VCF file
4699                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
4700
4701                # If VCF exists
4702                if os.path.exists(output_results_vcf):
4703
4704                    # Log
4705                    log.debug("Exomiser result VCF update variants")
4706
4707                    # Find Exomiser INFO field annotation in header
4708                    with gzip.open(output_results_vcf, "rt") as f:
4709                        header_list = self.read_vcf_header(f)
4710                    exomiser_vcf_header = vcf.Reader(
4711                        io.StringIO("\n".join(header_list))
4712                    )
4713
4714                    # Add annotation INFO field to header
4715                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
4716
4717                    # Update variants with VCF
4718                    self.update_from_vcf(output_results_vcf)
4719
4720        return True

This function annotate with Exomiser

This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

  • "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO) Default : None
  • "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
  • "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
  • "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
  • "sample" (string): Sample name to construct "subject" section: "subject": { "id": "", "sex": "UNKNOWN_SEX" } Default: None
  • "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
  • "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
  • "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
  • "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
  • "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
  • "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (takes a while). Default: None (provided by application.properties configuration file)
  • "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).

Notes:

  • If no sample in parameters, first sample in VCF will be chosen
  • If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_snpeff(self, threads: int = None) -> None:
4722    def annotation_snpeff(self, threads: int = None) -> None:
4723        """
4724        This function annotate with snpEff
4725
4726        :param threads: The number of threads to use
4727        :return: the value of the variable "return_value".
4728        """
4729
4730        # DEBUG
4731        log.debug("Start annotation with snpeff databases")
4732
4733        # Threads
4734        if not threads:
4735            threads = self.get_threads()
4736        log.debug("Threads: " + str(threads))
4737
4738        # DEBUG
4739        delete_tmp = True
4740        if self.get_config().get("verbosity", "warning") in ["debug"]:
4741            delete_tmp = False
4742            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4743
4744        # Config
4745        config = self.get_config()
4746        log.debug("Config: " + str(config))
4747
4748        # Config - Folders - Databases
4749        databases_folders = (
4750            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
4751        )
4752        log.debug("Databases annotations: " + str(databases_folders))
4753
4754        # # Config - Java
4755        # java_bin = get_bin(
4756        #     tool="java",
4757        #     bin="java",
4758        #     bin_type="bin",
4759        #     config=config,
4760        #     default_folder="/usr/bin",
4761        # )
4762        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
4763        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
4764        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
4765
4766        # # Config - snpEff bin
4767        # snpeff_jar = get_bin(
4768        #     tool="snpeff",
4769        #     bin="snpEff.jar",
4770        #     bin_type="jar",
4771        #     config=config,
4772        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4773        # )
4774        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
4775        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4776        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4777
4778        # Config - snpEff bin command
4779        snpeff_bin_command = get_bin_command(
4780            bin="snpEff.jar",
4781            tool="snpeff",
4782            bin_type="jar",
4783            config=config,
4784            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4785        )
4786        if not snpeff_bin_command:
4787            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
4788            log.error(msg_err)
4789            raise ValueError(msg_err)
4790
4791        # Config - snpEff databases
4792        snpeff_databases = (
4793            config.get("folders", {})
4794            .get("databases", {})
4795            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
4796        )
4797        snpeff_databases = full_path(snpeff_databases)
4798        if snpeff_databases is not None and snpeff_databases != "":
4799            log.debug(f"Create snpEff databases folder")
4800            if not os.path.exists(snpeff_databases):
4801                os.makedirs(snpeff_databases)
4802
4803        # Param
4804        param = self.get_param()
4805        log.debug("Param: " + str(param))
4806
4807        # Param
4808        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
4809        log.debug("Options: " + str(options))
4810
4811        # Param - Assembly
4812        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4813
4814        # Param - Options
4815        snpeff_options = (
4816            param.get("annotation", {}).get("snpeff", {}).get("options", "")
4817        )
4818        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
4819        snpeff_csvstats = (
4820            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
4821        )
4822        if snpeff_stats:
4823            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
4824            snpeff_stats = full_path(snpeff_stats)
4825            snpeff_options += f" -stats {snpeff_stats}"
4826        if snpeff_csvstats:
4827            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
4828            snpeff_csvstats = full_path(snpeff_csvstats)
4829            snpeff_options += f" -csvStats {snpeff_csvstats}"
4830
4831        # Data
4832        table_variants = self.get_table_variants()
4833
4834        # Check if not empty
4835        log.debug("Check if not empty")
4836        sql_query_chromosomes = (
4837            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4838        )
4839        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
4840        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4841            log.info(f"VCF empty")
4842            return
4843
4844        # Export in VCF
4845        log.debug("Create initial file to annotate")
4846        tmp_vcf = NamedTemporaryFile(
4847            prefix=self.get_prefix(),
4848            dir=self.get_tmp_dir(),
4849            suffix=".vcf.gz",
4850            delete=True,
4851        )
4852        tmp_vcf_name = tmp_vcf.name
4853
4854        # VCF header
4855        vcf_reader = self.get_header()
4856        log.debug("Initial header: " + str(vcf_reader.infos))
4857
4858        # Existing annotations
4859        for vcf_annotation in self.get_header().infos:
4860
4861            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4862            log.debug(
4863                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4864            )
4865
4866        # Memory limit
4867        # if config.get("memory", None):
4868        #     memory_limit = config.get("memory", "8G")
4869        # else:
4870        #     memory_limit = "8G"
4871        memory_limit = self.get_memory("8G")
4872        log.debug(f"memory_limit: {memory_limit}")
4873
4874        # snpEff java options
4875        snpeff_java_options = (
4876            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4877        )
4878        log.debug(f"Exomiser java options: {snpeff_java_options}")
4879
4880        force_update_annotation = True
4881
4882        if "ANN" not in self.get_header().infos or force_update_annotation:
4883
4884            # Check snpEff database
4885            log.debug(f"Check snpEff databases {[assembly]}")
4886            databases_download_snpeff(
4887                folder=snpeff_databases, assemblies=[assembly], config=config
4888            )
4889
4890            # Export VCF file
4891            self.export_variant_vcf(
4892                vcf_file=tmp_vcf_name,
4893                remove_info=True,
4894                add_samples=False,
4895                index=True,
4896            )
4897
4898            # Tmp file
4899            err_files = []
4900            tmp_annotate_vcf = NamedTemporaryFile(
4901                prefix=self.get_prefix(),
4902                dir=self.get_tmp_dir(),
4903                suffix=".vcf",
4904                delete=False,
4905            )
4906            tmp_annotate_vcf_name = tmp_annotate_vcf.name
4907            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
4908            err_files.append(tmp_annotate_vcf_name_err)
4909
4910            # Command
4911            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
4912            log.debug(f"Annotation - snpEff command: {snpeff_command}")
4913            run_parallel_commands([snpeff_command], 1)
4914
4915            # Error messages
4916            log.info(f"Error/Warning messages:")
4917            error_message_command_all = []
4918            error_message_command_warning = []
4919            error_message_command_err = []
4920            for err_file in err_files:
4921                with open(err_file, "r") as f:
4922                    for line in f:
4923                        message = line.strip()
4924                        error_message_command_all.append(message)
4925                        if line.startswith("[W::"):
4926                            error_message_command_warning.append(message)
4927                        if line.startswith("[E::"):
4928                            error_message_command_err.append(f"{err_file}: " + message)
4929            # log info
4930            for message in list(
4931                set(error_message_command_err + error_message_command_warning)
4932            ):
4933                log.info(f"   {message}")
4934            # debug info
4935            for message in list(set(error_message_command_all)):
4936                log.debug(f"   {message}")
4937            # failed
4938            if len(error_message_command_err):
4939                log.error("Annotation failed: Error in commands")
4940                raise ValueError("Annotation failed: Error in commands")
4941
4942            # Find annotation in header
4943            with open(tmp_annotate_vcf_name, "rt") as f:
4944                header_list = self.read_vcf_header(f)
4945            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
4946
4947            for ann in annovar_vcf_header.infos:
4948                if ann not in self.get_header().infos:
4949                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
4950
4951            # Update variants
4952            log.info(f"Annotation - Updating...")
4953            self.update_from_vcf(tmp_annotate_vcf_name)
4954
4955        else:
4956            if "ANN" in self.get_header().infos:
4957                log.debug(f"Existing snpEff annotations in VCF")
4958            if force_update_annotation:
4959                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

This function annotate with snpEff

Parameters
  • threads: The number of threads to use
Returns

the value of the variable "return_value".

def annotation_annovar(self, threads: int = None) -> None:
    def annotation_annovar(self, threads: int = None) -> None:
        """
        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
        annotations

        For each configured Annovar database: variants are exported to a
        temporary VCF, annotated with table_annovar.pl, post-processed through
        a bcftools/sed/awk pipeline (field renaming, removal of Annovar
        bookkeeping tags, cleaning of empty "." values), then all annotated
        VCFs are merged with bcftools merge and the variants table is updated.
        Temporary files are removed at the end.

        :param threads: number of threads to use (defaults to
            ``self.get_threads()`` when not provided)
        :return: None. Returns early when the variants table is empty or no
            annotations are configured.
        :raises ValueError: if the annovar or bcftools bin commands cannot be
            resolved, or if an annotation command reports errors
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files, accumulated for final cleanup
        tmp_files = []
        err_files = []

        # Keep tmp files/folders in debug mode to ease troubleshooting
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl table_annovar.pl invocation)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command (used for the post-processing pipeline)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations (dict of Annovar database -> fields mapping)
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        force_update_annotation = True

        if annotations:

            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            # NOTE(review): remove_info="." here vs remove_info=True in
            # annotation_snpeff — presumably keeps "." INFO placeholders;
            # confirm against export_variant_vcf
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (bcftools --rename-annots input)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing databases)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One Annovar run per configured database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is re-initialized per database, so
                # the error scan below only sees the current database's stderr
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # Field new name; renaming to a new name is not fully
                    # managed yet (TODO)
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one "old new" pair per line)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "g" gene-based, "r" region-based, "f" filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                # Rebuilds each INFO column keeping only key=value pairs whose
                # value is not "."
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: htslib-style "[W::"/"[E::" plus plain
                # WARNING/ERROR prefixes classify warnings vs errors
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed: any error line aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                # NOTE(review): the merge stderr file is collected here but is
                # not scanned for errors after the merge command runs
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: combine the initial VCF with all per-database
                # annotated VCFs into one file
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                # Register INFO fields not already present in the header
                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command
            # NOTE(review): cleanup always runs (delete_tmp is not consulted
            # here) — confirm whether debug mode should keep these files
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)

It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations

Parameters
  • threads: number of threads to use
Returns

the value of the variable "return_value".

def annotation_parquet(self, threads: int = None) -> None:
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table INFO column using Parquet-style annotation
        databases ("variants" databases or "regions" databases).

        For each database configured in param["annotation"]["parquet"]["annotations"],
        this method:
          1. resolves the database file and its header file via a Database object,
          2. selects the fields to transfer ("ALL"/"INFO" means every field
             declared in the database header),
          3. registers each transferred field in the in-memory VCF header, and
          4. builds one SQL UPDATE per chromosome that concatenates the selected
             annotations into the INFO column, then executes them sequentially.

        The special annotation key "ALL" triggers a scan of the databases folders
        and annotates with every database found (optionally filtered by
        "formats" and "releases").

        Options (param["annotation"]["options"]):
          - "annotations_update": existing values of the annotated fields are
            first removed from INFO (forced re-annotation),
          - "annotations_append": only variants whose field is empty/missing
            are annotated.

        :param threads: number of threads to use for the annotation
        :return: None; the method annotates in place and returns early if the
            variants table is empty
        :raises ValueError: if a configured database file or its header file
            cannot be found
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is only computed for this debug log; it is
        # not used later in this method
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        # Folders to search for annotation databases ("annotations" + "parquet")
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        # Dict of databases to annotate with: {database: {field: new_name, ...}}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation
        # "annotations_update": strip existing field values from INFO before annotating
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # "annotations_append": only fill fields that are empty/missing in INFO
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        # vcf_reader is the live header object; new INFO fields are added to it below
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        # NOTE(review): nothing appends to this list in this method, so the
        # cleanup loop at the end is currently a no-op
        added_columns = []

        # drop indexes
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-database: scan folders and add every database found
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # "ALL" was expanded above; skip the pseudo-entry itself
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields
                # Empty/None field dict means "take all INFO fields"
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    log.error("Annotation failed: file not found")
                    raise ValueError("Annotation failed: file not found")
                else:
                    # Get parquet connexion
                    # Attach the database to the DuckDB connection if required
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos
                    # Extra columns = database columns beyond the standard VCF ones
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                # Register the extra column as a generic String
                                # INFO field with a placeholder description
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        # Expand to every field declared in the database header
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate
                    # (used only for "regions" databases, where several regions
                    # may overlap one variant position)
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping
                    # Maps a requested field name to the database column holding it
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        # Default to the INFO column when no dedicated column exists
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # Annotate when the field exists in the database header AND
                        # (update/append is forced OR the field is new to the VCF header)
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO
                                # (regex strips ';FIELD=value' occurrences so the
                                # field can be re-annotated from scratch)
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                                concat(table_variants.INFO,''),
                                                ';*{annotation_fields_new_name}=[^;]*',
                                                ''
                                                )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header
                            # Fall back to safe defaults for any missing metadata
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append
                            # In append mode, restrict the update to variants
                            # whose field value is currently empty/missing
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                    END
                                """
                                )
                            # Found in a specific column
                            else:
                                # ';' in values is replaced by ',' so it cannot
                                # break the INFO field separator
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
                                        ELSE ''
                                    END
                                """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        # Shortcut: copy the database INFO column wholesale
                        # instead of one CASE expression per field
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # NOTE(review): query_dict aliases query_dict_remove (no
                        # copy), so removal queries run before the per-chromosome
                        # annotation queries added below
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database
                            # Join on position overlap (START/END interval) and
                            # aggregate overlapping regions per POS
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT 
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                                )
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                        )
                                        as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database
                            # Exact match on CHROM/POS/REF/ALT
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query
                            # Concatenates: existing INFO (if any) + ';' separator
                            # (only when both sides are non-empty) + new annotations
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                    SET INFO = 
                                        concat(
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                THEN table_variants.INFO
                                                ELSE ''
                                            END
                                            ,
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                        AND (
                                                        concat({sql_query_annotation_update_info_sets_sql})
                                                        )
                                                        NOT IN ('','.') 
                                                    THEN ';'
                                                    ELSE ''
                                            END
                                            ,
                                            {sql_query_annotation_update_info_sets_sql}
                                            )
                                    {sql_query_annotation_from_clause}
                                    WHERE {sql_query_annotation_where_clause}
                                    ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x
                        # Raise DuckDB's expression-depth limit: the generated
                        # concat() can nest one CASE per annotated field
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # Number of rows updated, as reported in the "Count"
                            # column of the UPDATE result
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

                    log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

It takes a VCF file and annotates it with a parquet file.

Parameters
  • threads: number of threads to use for the annotation
Returns
  • the value of the variable "result"

def annotation_splice(self, threads: int = None) -> None:
5919    def annotation_splice(self, threads: int = None) -> None:
5920        """
5921        This function annotate with snpEff
5922
5923        :param threads: The number of threads to use
5924        :return: the value of the variable "return_value".
5925        """
5926
5927        # DEBUG
5928        log.debug("Start annotation with splice tools")
5929
5930        # Threads
5931        if not threads:
5932            threads = self.get_threads()
5933        log.debug("Threads: " + str(threads))
5934
5935        # DEBUG
5936        delete_tmp = True
5937        if self.get_config().get("verbosity", "warning") in ["debug"]:
5938            delete_tmp = False
5939            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5940
5941        # Config
5942        config = self.get_config()
5943        log.debug("Config: " + str(config))
5944        splice_config = config.get("tools", {}).get("splice", {})
5945        if not splice_config:
5946            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
5947        if not splice_config:
5948            msg_err = "No Splice tool config"
5949            log.error(msg_err)
5950            raise ValueError(msg_err)
5951        log.debug(f"splice_config={splice_config}")
5952
5953        # Config - Folders - Databases
5954        databases_folders = (
5955            config.get("folders", {}).get("databases", {}).get("splice", ["."])
5956        )
5957        log.debug("Databases annotations: " + str(databases_folders))
5958
5959        # Splice docker image
5960        splice_docker_image = splice_config.get("docker").get("image")
5961
5962        # Pull splice image if it's not already there
5963        if not check_docker_image_exists(splice_docker_image):
5964            log.warning(
5965                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
5966            )
5967            try:
5968                command(f"docker pull {splice_config.get('docker').get('image')}")
5969            except subprocess.CalledProcessError:
5970                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
5971                log.error(msg_err)
5972                raise ValueError(msg_err)
5973                return None
5974
5975        # Config - splice databases
5976        splice_databases = (
5977            config.get("folders", {})
5978            .get("databases", {})
5979            .get("splice", DEFAULT_SPLICE_FOLDER)
5980        )
5981        splice_databases = full_path(splice_databases)
5982
5983        # Param
5984        param = self.get_param()
5985        log.debug("Param: " + str(param))
5986
5987        # Param
5988        options = param.get("annotation", {}).get("splice", {})
5989        log.debug("Options: " + str(options))
5990
5991        # Data
5992        table_variants = self.get_table_variants()
5993
5994        # Check if not empty
5995        log.debug("Check if not empty")
5996        sql_query_chromosomes = (
5997            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5998        )
5999        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
6000            log.info("VCF empty")
6001            return None
6002
6003        # Export in VCF
6004        log.debug("Create initial file to annotate")
6005
6006        # Create output folder
6007        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
6008        if not os.path.exists(output_folder):
6009            Path(output_folder).mkdir(parents=True, exist_ok=True)
6010
6011        # Create tmp VCF file
6012        tmp_vcf = NamedTemporaryFile(
6013            prefix=self.get_prefix(),
6014            dir=output_folder,
6015            suffix=".vcf",
6016            delete=False,
6017        )
6018        tmp_vcf_name = tmp_vcf.name
6019
6020        # VCF header
6021        header = self.get_header()
6022
6023        # Existing annotations
6024        for vcf_annotation in self.get_header().infos:
6025
6026            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
6027            log.debug(
6028                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
6029            )
6030
6031        # Memory limit
6032        if config.get("memory", None):
6033            memory_limit = config.get("memory", "8G").upper()
6034            # upper()
6035        else:
6036            memory_limit = "8G"
6037        log.debug(f"memory_limit: {memory_limit}")
6038
6039        # Check number of variants to annotate
6040        where_clause_regex_spliceai = r"SpliceAI_\w+"
6041        where_clause_regex_spip = r"SPiP_\w+"
6042        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
6043        df_list_of_variants_to_annotate = self.get_query_to_df(
6044            query=f""" SELECT * FROM variants {where_clause} """
6045        )
6046        if len(df_list_of_variants_to_annotate) == 0:
6047            log.warning(
6048                f"No variants to annotate with splice. Variants probably already annotated with splice"
6049            )
6050            return None
6051        else:
6052            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
6053
6054        # Export VCF file
6055        self.export_variant_vcf(
6056            vcf_file=tmp_vcf_name,
6057            remove_info=True,
6058            add_samples=True,
6059            index=False,
6060            where_clause=where_clause,
6061        )
6062
6063        # Create docker container and launch splice analysis
6064        if splice_config:
6065
6066            # Splice mount folders
6067            mount_folders = splice_config.get("mount", {})
6068
6069            # Genome mount
6070            mount_folders[
6071                config.get("folders", {})
6072                .get("databases", {})
6073                .get("genomes", DEFAULT_GENOME_FOLDER)
6074            ] = "ro"
6075
6076            # SpliceAI mount
6077            mount_folders[
6078                config.get("folders", {})
6079                .get("databases", {})
6080                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
6081            ] = "ro"
6082
6083            # Genome mount
6084            mount_folders[
6085                config.get("folders", {})
6086                .get("databases", {})
6087                .get("spip", DEFAULT_SPIP_FOLDER)
6088            ] = "ro"
6089
6090            # Mount folders
6091            mount = []
6092
6093            # Config mount
6094            mount = [
6095                f"-v {full_path(path)}:{full_path(path)}:{mode}"
6096                for path, mode in mount_folders.items()
6097            ]
6098
6099            if any(value for value in splice_config.values() if value is None):
6100                log.warning("At least one splice config parameter is empty")
6101                return None
6102
6103            # Params in splice nf
6104            def check_values(dico: dict):
6105                """
6106                Ensure parameters for NF splice pipeline
6107                """
6108                for key, val in dico.items():
6109                    if key == "genome":
6110                        if any(
6111                            assemb in options.get("genome", {})
6112                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6113                        ):
6114                            yield f"--{key} hg19"
6115                        elif any(
6116                            assemb in options.get("genome", {})
6117                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6118                        ):
6119                            yield f"--{key} hg38"
6120                    elif (
6121                        (isinstance(val, str) and val)
6122                        or isinstance(val, int)
6123                        or isinstance(val, bool)
6124                    ):
6125                        yield f"--{key} {val}"
6126
6127            # Genome
6128            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6129            options["genome"] = genome
6130
6131            # NF params
6132            nf_params = []
6133
6134            # Add options
6135            if options:
6136                nf_params = list(check_values(options))
6137                log.debug(f"Splice NF params: {' '.join(nf_params)}")
6138            else:
6139                log.debug("No NF params provided")
6140
6141            # Add threads
6142            if "threads" not in options.keys():
6143                nf_params.append(f"--threads {threads}")
6144
6145            # Genome path
6146            genome_path = find_genome(
6147                config.get("folders", {})
6148                .get("databases", {})
6149                .get("genomes", DEFAULT_GENOME_FOLDER),
6150                file=f"{genome}.fa",
6151            )
6152            # Add genome path
6153            if not genome_path:
6154                raise ValueError(
6155                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6156                )
6157            else:
6158                log.debug(f"Genome: {genome_path}")
6159                nf_params.append(f"--genome_path {genome_path}")
6160
6161            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6162                """
6163                Setting up updated databases for SPiP and SpliceAI
6164                """
6165
6166                try:
6167
6168                    # SpliceAI assembly transcriptome
6169                    spliceai_assembly = os.path.join(
6170                        config.get("folders", {})
6171                        .get("databases", {})
6172                        .get("spliceai", {}),
6173                        options.get("genome"),
6174                        "transcriptome",
6175                    )
6176                    spip_assembly = options.get("genome")
6177
6178                    spip = find(
6179                        f"transcriptome_{spip_assembly}.RData",
6180                        config.get("folders", {}).get("databases", {}).get("spip", {}),
6181                    )
6182                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6183                    log.debug(f"SPiP annotations: {spip}")
6184                    log.debug(f"SpliceAI annotations: {spliceai}")
6185                    if spip and spliceai:
6186                        return [
6187                            f"--spip_transcriptome {spip}",
6188                            f"--spliceai_annotations {spliceai}",
6189                        ]
6190                    else:
6191                        # TODO crash and go on with basic annotations ?
6192                        # raise ValueError(
6193                        #     "Can't find splice databases in configuration EXIT"
6194                        # )
6195                        log.warning(
6196                            "Can't find splice databases in configuration, use annotations file from image"
6197                        )
6198                except TypeError:
6199                    log.warning(
6200                        "Can't find splice databases in configuration, use annotations file from image"
6201                    )
6202                    return []
6203
6204            # Add options, check if transcriptome option have already beend provided
6205            if (
6206                "spip_transcriptome" not in nf_params
6207                and "spliceai_transcriptome" not in nf_params
6208            ):
6209                splice_reference = splice_annotations(options, config)
6210                if splice_reference:
6211                    nf_params.extend(splice_reference)
6212
6213            nf_params.append(f"--output_folder {output_folder}")
6214
6215            random_uuid = f"HOWARD-SPLICE-{get_random()}"
6216            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6217            log.debug(cmd)
6218
6219            splice_config["docker"]["command"] = cmd
6220
6221            docker_cmd = get_bin_command(
6222                tool="splice",
6223                bin_type="docker",
6224                config=config,
6225                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6226                add_options=f"--name {random_uuid} {' '.join(mount)}",
6227            )
6228
6229            # Docker debug
6230            # if splice_config.get("rm_container"):
6231            #     rm_container = "--rm"
6232            # else:
6233            #     rm_container = ""
6234            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6235
6236            log.debug(docker_cmd)
6237            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6238            log.debug(res.stdout)
6239            if res.stderr:
6240                log.error(res.stderr)
6241            res.check_returncode()
6242        else:
6243            log.warning(f"Splice tool configuration not found: {config}")
6244
6245        # Update variants
6246        log.info("Annotation - Updating...")
6247        # Test find output vcf
6248        log.debug(
6249            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6250        )
6251        output_vcf = []
6252        # Wrong folder to look in
6253        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6254            if (
6255                files
6256                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6257            ):
6258                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6259        # log.debug(os.listdir(options.get("output_folder")))
6260        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6261        if not output_vcf:
6262            log.debug(
6263                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6264            )
6265        else:
6266            # Get new header from annotated vcf
6267            log.debug(f"Initial header: {len(header.infos)} fields")
6268            # Create new header with splice infos
6269            new_vcf = Variants(input=output_vcf[0])
6270            new_vcf_header = new_vcf.get_header().infos
6271            for keys, infos in new_vcf_header.items():
6272                if keys not in header.infos.keys():
6273                    header.infos[keys] = infos
6274            log.debug(f"New header: {len(header.infos)} fields")
6275            log.debug(f"Splice tmp output: {output_vcf[0]}")
6276            self.update_from_vcf(output_vcf[0])
6277
6278        # Remove folder
6279        remove_if_exists(output_folder)

This function annotates variants with splice prediction tools (SpliceAI, SPiP)

Parameters
  • threads: The number of threads to use
Returns

None

def get_config_default(self, name: str) -> dict:
6285    def get_config_default(self, name: str) -> dict:
6286        """
6287        The function `get_config_default` returns a dictionary containing default configurations for
6288        various calculations and prioritizations.
6289
6290        :param name: The `get_config_default` function returns a dictionary containing default
6291        configurations for different calculations and prioritizations. The `name` parameter is used to
6292        specify which specific configuration to retrieve from the dictionary
6293        :type name: str
6294        :return: The function `get_config_default` returns a dictionary containing default configuration
6295        settings for different calculations and prioritizations. The specific configuration settings are
6296        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6297        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6298        returned. If there is no match, an empty dictionary is returned.
6299        """
6300
6301        config_default = {
6302            "calculations": {
6303                "variant_chr_pos_alt_ref": {
6304                    "type": "sql",
6305                    "name": "variant_chr_pos_alt_ref",
6306                    "description": "Create a variant ID with chromosome, position, alt and ref",
6307                    "available": False,
6308                    "output_column_name": "variant_chr_pos_alt_ref",
6309                    "output_column_type": "String",
6310                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6311                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6312                    "operation_info": True,
6313                },
6314                "VARTYPE": {
6315                    "type": "sql",
6316                    "name": "VARTYPE",
6317                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6318                    "available": True,
6319                    "output_column_name": "VARTYPE",
6320                    "output_column_type": "String",
6321                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6322                    "operation_query": """
6323                            CASE
6324                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6325                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6326                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6327                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6328                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6329                                ELSE 'UNDEFINED'
6330                            END
6331                            """,
6332                    "info_fields": ["SVTYPE"],
6333                    "operation_info": True,
6334                },
6335                "snpeff_hgvs": {
6336                    "type": "python",
6337                    "name": "snpeff_hgvs",
6338                    "description": "HGVS nomenclatures from snpEff annotation",
6339                    "available": True,
6340                    "function_name": "calculation_extract_snpeff_hgvs",
6341                    "function_params": ["snpeff_hgvs", "ANN"],
6342                },
6343                "snpeff_ann_explode": {
6344                    "type": "python",
6345                    "name": "snpeff_ann_explode",
6346                    "description": "Explode snpEff annotations with uniquify values",
6347                    "available": True,
6348                    "function_name": "calculation_snpeff_ann_explode",
6349                    "function_params": [False, "fields", "snpeff_", "ANN"],
6350                },
6351                "snpeff_ann_explode_uniquify": {
6352                    "type": "python",
6353                    "name": "snpeff_ann_explode_uniquify",
6354                    "description": "Explode snpEff annotations",
6355                    "available": True,
6356                    "function_name": "calculation_snpeff_ann_explode",
6357                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
6358                },
6359                "snpeff_ann_explode_json": {
6360                    "type": "python",
6361                    "name": "snpeff_ann_explode_json",
6362                    "description": "Explode snpEff annotations in JSON format",
6363                    "available": True,
6364                    "function_name": "calculation_snpeff_ann_explode",
6365                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
6366                },
6367                "NOMEN": {
6368                    "type": "python",
6369                    "name": "NOMEN",
6370                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
6371                    "available": True,
6372                    "function_name": "calculation_extract_nomen",
6373                    "function_params": [],
6374                },
6375                "FINDBYPIPELINE": {
6376                    "type": "python",
6377                    "name": "FINDBYPIPELINE",
6378                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6379                    "available": True,
6380                    "function_name": "calculation_find_by_pipeline",
6381                    "function_params": ["findbypipeline"],
6382                },
6383                "FINDBYSAMPLE": {
6384                    "type": "python",
6385                    "name": "FINDBYSAMPLE",
6386                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6387                    "available": True,
6388                    "function_name": "calculation_find_by_pipeline",
6389                    "function_params": ["findbysample"],
6390                },
6391                "GENOTYPECONCORDANCE": {
6392                    "type": "python",
6393                    "name": "GENOTYPECONCORDANCE",
6394                    "description": "Concordance of genotype for multi caller VCF",
6395                    "available": True,
6396                    "function_name": "calculation_genotype_concordance",
6397                    "function_params": [],
6398                },
6399                "BARCODE": {
6400                    "type": "python",
6401                    "name": "BARCODE",
6402                    "description": "BARCODE as VaRank tool",
6403                    "available": True,
6404                    "function_name": "calculation_barcode",
6405                    "function_params": [],
6406                },
6407                "BARCODEFAMILY": {
6408                    "type": "python",
6409                    "name": "BARCODEFAMILY",
6410                    "description": "BARCODEFAMILY as VaRank tool",
6411                    "available": True,
6412                    "function_name": "calculation_barcode_family",
6413                    "function_params": ["BCF"],
6414                },
6415                "TRIO": {
6416                    "type": "python",
6417                    "name": "TRIO",
6418                    "description": "Inheritance for a trio family",
6419                    "available": True,
6420                    "function_name": "calculation_trio",
6421                    "function_params": [],
6422                },
6423                "VAF": {
6424                    "type": "python",
6425                    "name": "VAF",
6426                    "description": "Variant Allele Frequency (VAF) harmonization",
6427                    "available": True,
6428                    "function_name": "calculation_vaf_normalization",
6429                    "function_params": [],
6430                },
6431                "VAF_stats": {
6432                    "type": "python",
6433                    "name": "VAF_stats",
6434                    "description": "Variant Allele Frequency (VAF) statistics",
6435                    "available": True,
6436                    "function_name": "calculation_genotype_stats",
6437                    "function_params": ["VAF"],
6438                },
6439                "DP_stats": {
6440                    "type": "python",
6441                    "name": "DP_stats",
6442                    "description": "Depth (DP) statistics",
6443                    "available": True,
6444                    "function_name": "calculation_genotype_stats",
6445                    "function_params": ["DP"],
6446                },
6447                "variant_id": {
6448                    "type": "python",
6449                    "name": "variant_id",
6450                    "description": "Variant ID generated from variant position and type",
6451                    "available": True,
6452                    "function_name": "calculation_variant_id",
6453                    "function_params": [],
6454                },
6455                "transcripts_json": {
6456                    "type": "python",
6457                    "name": "transcripts_json",
6458                    "description": "Add transcripts info in JSON format (field 'transcripts_json')",
6459                    "available": True,
6460                    "function_name": "calculation_transcripts_json",
6461                    "function_params": ["transcripts_json"],
6462                },
6463                "transcripts_prioritization": {
6464                    "type": "python",
6465                    "name": "transcripts_prioritization",
6466                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
6467                    "available": True,
6468                    "function_name": "calculation_transcripts_prioritization",
6469                    "function_params": [],
6470                },
6471            },
6472            "prioritizations": {
6473                "default": {
6474                    "filter": [
6475                        {
6476                            "type": "notequals",
6477                            "value": "!PASS|\\.",
6478                            "score": 0,
6479                            "flag": "FILTERED",
6480                            "comment": ["Bad variant quality"],
6481                        },
6482                        {
6483                            "type": "equals",
6484                            "value": "REJECT",
6485                            "score": -20,
6486                            "flag": "PASS",
6487                            "comment": ["Bad variant quality"],
6488                        },
6489                    ],
6490                    "DP": [
6491                        {
6492                            "type": "gte",
6493                            "value": "50",
6494                            "score": 5,
6495                            "flag": "PASS",
6496                            "comment": ["DP higher than 50"],
6497                        }
6498                    ],
6499                    "ANN": [
6500                        {
6501                            "type": "contains",
6502                            "value": "HIGH",
6503                            "score": 5,
6504                            "flag": "PASS",
6505                            "comment": [
6506                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6507                            ],
6508                        },
6509                        {
6510                            "type": "contains",
6511                            "value": "MODERATE",
6512                            "score": 3,
6513                            "flag": "PASS",
6514                            "comment": [
6515                                "A non-disruptive variant that might change protein effectiveness"
6516                            ],
6517                        },
6518                        {
6519                            "type": "contains",
6520                            "value": "LOW",
6521                            "score": 0,
6522                            "flag": "FILTERED",
6523                            "comment": [
6524                                "Assumed to be mostly harmless or unlikely to change protein behavior"
6525                            ],
6526                        },
6527                        {
6528                            "type": "contains",
6529                            "value": "MODIFIER",
6530                            "score": 0,
6531                            "flag": "FILTERED",
6532                            "comment": [
6533                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
6534                            ],
6535                        },
6536                    ],
6537                }
6538            },
6539        }
6540
6541        return config_default.get(name, None)

The function get_config_default returns a dictionary containing default configurations for various calculations and prioritizations.

Parameters
  • name: The get_config_default function returns a dictionary containing default configurations for different calculations and prioritizations. The name parameter is used to specify which specific configuration to retrieve from the dictionary
Returns

The function get_config_default returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input name parameter provided to the function. If the name parameter matches a key in the config_default dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.

def get_config_json(self, name: str, config_dict: dict = {}, config_file: str = None) -> dict:
6543    def get_config_json(
6544        self, name: str, config_dict: dict = {}, config_file: str = None
6545    ) -> dict:
6546        """
6547        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
6548        default values, a dictionary, and a file.
6549
6550        :param name: The `name` parameter in the `get_config_json` function is a string that represents
6551        the name of the configuration. It is used to identify and retrieve the configuration settings
6552        for a specific component or module
6553        :type name: str
6554        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
6555        dictionary that allows you to provide additional configuration settings or overrides. When you
6556        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
6557        the key is the configuration setting you want to override or
6558        :type config_dict: dict
6559        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
6560        specify the path to a configuration file that contains additional settings. If provided, the
6561        function will read the contents of this file and update the configuration dictionary with the
6562        values found in the file, overriding any existing values with the
6563        :type config_file: str
6564        :return: The function `get_config_json` returns a dictionary containing the configuration
6565        settings.
6566        """
6567
6568        # Create with default prioritizations
6569        config_default = self.get_config_default(name=name)
6570        configuration = config_default
6571        # log.debug(f"configuration={configuration}")
6572
6573        # Replace prioritizations from dict
6574        for config in config_dict:
6575            configuration[config] = config_dict[config]
6576
6577        # Replace prioritizations from file
6578        config_file = full_path(config_file)
6579        if config_file:
6580            if os.path.exists(config_file):
6581                with open(config_file) as config_file_content:
6582                    config_file_dict = json.load(config_file_content)
6583                for config in config_file_dict:
6584                    configuration[config] = config_file_dict[config]
6585            else:
6586                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
6587                log.error(msg_error)
6588                raise ValueError(msg_error)
6589
6590        return configuration

The function get_config_json retrieves a configuration JSON object with prioritizations from default values, a dictionary, and a file.

Parameters
  • name: The name parameter in the get_config_json function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module
  • config_dict: The config_dict parameter in the get_config_json function is a dictionary that allows you to provide additional configuration settings or overrides. When you call the get_config_json function, you can pass a dictionary containing key-value pairs where the key is the configuration setting you want to override or
  • config_file: The config_file parameter in the get_config_json function is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary with the values found in the file, overriding any existing values with the
Returns

The function get_config_json returns a dictionary containing the configuration settings.

def prioritization( self, table: str = None, pz_prefix: str = None, pz_param: dict = None) -> bool:
    def prioritization(
        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
    ) -> bool:
        """
        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
        prioritizes variants based on configured profiles and criteria.

        :param table: The `table` parameter in the `prioritization` function is used to specify the name
        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
        a table name is provided, the method will prioritize the variants in that specific table
        :type table: str
        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
        provided, the code will use a default prefix value of "PZ"
        :type pz_prefix: str
        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
        additional parameters specific to the prioritization process. These parameters can include
        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
        configurations needed for the prioritization of variants in a VCF file
        :type pz_param: dict
        :return: True when prioritization was performed, False when no profile is defined
        :raises ValueError: if a requested profile is not present in the prioritizations configuration
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Prioritization param: an explicit pz_param argument overrides param["prioritization"]
        if pz_param is not None:
            prioritization_param = pz_param
        else:
            prioritization_param = param.get("prioritization", {})

        # Configuration profiles: defaults merged with an optional JSON config file
        prioritization_config_file = prioritization_param.get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization prefix for generated INFO fields (default "PZ")
        pz_prefix_default = "PZ"
        if pz_prefix is None:
            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)

        # Prioritization options (comma-separated strings are split into lists)
        profiles = prioritization_param.get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = prioritization_param.get(
            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = prioritization_param.get("default_profile", None)
        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
        prioritization_score_mode = prioritization_param.get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations: extra profiles requested directly via param["prioritizations"]
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                    log.info(f"   {profile}")

        # If profile "ALL" provided, all profiles in the config profiles
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Fail early if any requested profile is not configured
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            log.debug(f"No profile defined")
            return False

        # Default profile falls back to the first requested profile
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Target table: explicit argument, or the variants table in "update" mode
        if table is not None:
            table_variants = table
        else:
            table_variants = self.get_table_variants(clause="update")
        log.debug(f"Table to prioritize: {table_variants}")

        # Working columns added for the computation; dropped again at the end
        added_columns = []

        # Full list of candidate PZfields: base fields plus one per (field, profile) pair
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Keep only PZfields not already declared in the VCF header; existing ones are left untouched
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos prefix
            explode_infos_prefix = self.get_explode_infos_prefix()

            # PZfields tags description (VCF header metadata for each generated INFO field)
            PZfields_INFOS = {
                f"{pz_prefix}Tags": {
                    "ID": f"{pz_prefix}Tags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                f"{pz_prefix}Score": {
                    "ID": f"{pz_prefix}Score",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                f"{pz_prefix}Flag": {
                    "ID": f"{pz_prefix}Flag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                f"{pz_prefix}Comment": {
                    "ID": f"{pz_prefix}Comment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                f"{pz_prefix}Infos": {
                    "ID": f"{pz_prefix}Infos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
            }

            # Create base INFO header fields if not already present (tied to the default profile)
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create per-profile INFO header fields if not already present
            # NOTE(review): `profiles == []` looks unreachable here (empty profiles returned above) — confirm
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Add one working column per PZfield, typed by field kind
            # (Score: INTEGER starting at 0; Flag: BOOLEAN starting at 1/PASS; others: STRING)
            for pzfield in list_of_pzfields:
                if re.match(f"{pz_prefix}Score.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match(f"{pz_prefix}Flag.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

            # Profiles
            if profiles:

                # foreach profile in configuration file
                for profile in prioritizations_config:

                    # If profile is asked in param, or ALL are asked (empty profile [])
                    if profile in profiles or profiles == []:
                        log.info(f"Profile '{profile}'")

                        sql_set_info_option = ""

                        sql_set_info = []

                        # SQL snippets that serialize each PZ column back into the INFO field

                        # PZScore (plus un-suffixed alias when this is the default profile)
                        if (
                            f"{pz_prefix}Score{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                    concat(
                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
                                        {pz_prefix}Score{pzfields_sep}{profile}
                                    ) 
                                """
                            )
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Score" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                        concat(
                                            '{pz_prefix}Score=',
                                            {pz_prefix}Score{pzfields_sep}{profile}
                                        )
                                    """
                                )

                        # PZFlag (BOOLEAN column rendered as PASS/FILTERED)
                        if (
                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                    concat(
                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
                                        CASE 
                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                            THEN 'PASS'
                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                            THEN 'FILTERED'
                                        END
                                    ) 
                                """
                            )
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Flag" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                        concat(
                                            '{pz_prefix}Flag=',
                                            CASE 
                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                                THEN 'PASS'
                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                                THEN 'FILTERED'
                                            END
                                        )
                                    """
                                )

                        # PZComment (only emitted when non-empty)
                        if (
                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                    CASE
                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
                                        ELSE ''
                                    END
                                """
                            )
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Comment" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                        CASE
                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
                                            ELSE ''
                                        END
                                    """
                                )

                        # PZInfos (only emitted when non-empty)
                        if (
                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                    CASE
                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
                                        ELSE ''
                                    END
                                """
                            )
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Infos" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                        CASE
                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
                                            ELSE ''
                                        END
                                    """
                                )

                        # Merge PZfields snippets, joined with ';' (VCF INFO separator)
                        sql_set_info_option = ""
                        sql_set_sep = ""
                        for sql_set in sql_set_info:
                            if sql_set_sep:
                                sql_set_info_option += f"""
                                    , concat('{sql_set_sep}', {sql_set})
                                """
                            else:
                                sql_set_info_option += f"""
                                    , {sql_set}
                                """
                            sql_set_sep = ";"

                        sql_queries = []
                        for annotation in prioritizations_config[profile]:

                            # Explode specific annotation into its own column so criteria can filter on it
                            log.debug(f"Explode annotation '{annotation}'")
                            added_columns += self.explode_infos(
                                prefix=explode_infos_prefix,
                                fields=[annotation],
                                table=table_variants,
                            )
                            extra_infos = self.get_extra_infos(table=table_variants)

                            # Check if annotation field is present
                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
                                log.debug(f"Annotation '{annotation}' not in data")
                                continue
                            else:
                                log.debug(f"Annotation '{annotation}' in data")

                            # For each criterions
                            for criterion in prioritizations_config[profile][
                                annotation
                            ]:
                                criterion_type = criterion["type"]
                                criterion_value = criterion["value"]
                                criterion_score = criterion.get("score", 0)
                                criterion_flag = criterion.get("flag", "PASS")
                                criterion_flag_bool = criterion_flag == "PASS"
                                # Sanitize free text so it cannot break the SQL literal or the INFO field
                                criterion_comment = (
                                    ", ".join(criterion.get("comment", []))
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )
                                criterion_infos = (
                                    str(criterion)
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )

                                sql_set = []
                                sql_set_info = []

                                # SET clauses updating the PZ working columns for matching variants
                                if (
                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    # HOWARD mode sums scores; VaRank mode keeps the maximum
                                    if prioritization_score_mode == "HOWARD":
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                    elif prioritization_score_mode == "VaRank":
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
                                        )
                                    else:
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                if (
                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    # Flag stays PASS only while every matched criterion flags PASS
                                    sql_set.append(
                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                    )
                                if (
                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
                                                concat(
                                                    {pz_prefix}Comment{pzfields_sep}{profile},
                                                    CASE 
                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
                                                        THEN ', '
                                                        ELSE ''
                                                    END,
                                                    '{criterion_comment}'
                                                )
                                        """
                                    )
                                if (
                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
                                                concat(
                                                    {pz_prefix}Infos{pzfields_sep}{profile},
                                                    '{criterion_infos}'
                                                )
                                        """
                                    )
                                sql_set_option = ",".join(sql_set)

                                # Criterion comparison: numeric values use comparison_map operators,
                                # non-numeric values fall back to SIMILAR TO pattern matching
                                # NOTE(review): bare except also swallows a KeyError from
                                # comparison_map[criterion_type], silently turning an unknown
                                # numeric criterion type into a string match — confirm intended
                                if sql_set_option:
                                    try:
                                        float(criterion_value)
                                        sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                            AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
                                            """
                                    except:
                                        contains_option = ""
                                        if criterion_type == "contains":
                                            contains_option = ".*"
                                        sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                            """
                                    sql_queries.append(sql_update)
                                else:
                                    log.warning(
                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
                                    )

                        # PZTags
                        if (
                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):

                            # Build the PZTags value: '|'-separated field#value pairs
                            pztags_value = ""
                            pztags_sep_default = "|"
                            pztags_sep = ""
                            for pzfield in pzfields:
                                if pzfield not in [f"{pz_prefix}Tags"]:
                                    if (
                                        f"{pzfield}{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        if pzfield in [f"{pz_prefix}Flag"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
                                                    THEN 'PASS'
                                                    ELSE 'FILTERED'
                                                END, '"""
                                        else:
                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                        pztags_sep = pztags_sep_default
                            # Add query update for per-profile PZTags
                            sql_update_pztags = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                        INFO,
                                        CASE WHEN INFO NOT in ('','.')
                                                THEN ';'
                                                ELSE ''
                                        END,
                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
                                    )
                                """
                            sql_queries.append(sql_update_pztags)

                            # Add query update for un-suffixed PZTags of the default profile
                            # NOTE(review): unlike the per-profile update above, this always
                            # prepends ';' even when INFO is empty — confirm intended
                            if profile == default_profile:
                                sql_update_pztags_default = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                        INFO,
                                        ';',
                                        '{pz_prefix}Tags={pztags_value}'
                                    )
                                """
                                sql_queries.append(sql_update_pztags_default)

                        log.info(f"""Profile '{profile}' - Prioritization... """)

                        # Run all criterion/PZTags updates against the working columns
                        if sql_queries:

                            for sql_query in sql_queries:
                                log.debug(
                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                                )
                                self.conn.execute(sql_query)

                        # Serialize the computed PZ columns back into the INFO field
                        log.info(f"""Profile '{profile}' - Update... """)
                        sql_query_update = f"""
                            UPDATE {table_variants}
                            SET INFO =  
                                concat(
                                    CASE
                                        WHEN INFO NOT IN ('','.')
                                        THEN concat(INFO, ';')
                                        ELSE ''
                                    END
                                    {sql_set_info_option}
                                )
                        """
                        self.conn.execute(sql_query_update)

        else:

            log.warning(f"No profiles in parameters")

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return True

The prioritization function in Python processes VCF files, adds new INFO fields, and prioritizes variants based on configured profiles and criteria.

Parameters
  • table: The table parameter in the prioritization function is used to specify the name of the table (presumably a VCF file) on which the prioritization operation will be performed. If a table name is provided, the method will prioritize the variants in that specific table
  • pz_prefix: The pz_prefix parameter is used to specify a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ"
  • pz_param: The pz_param parameter in the prioritization method is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF file
Returns

A boolean value is returned from the prioritization function: True when prioritization was performed, False when no profile is defined.

def annotation_hgvs(self, threads: int = None) -> None:
7196    def annotation_hgvs(self, threads: int = None) -> None:
7197        """
7198        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
7199        coordinates and alleles.
7200
7201        :param threads: The `threads` parameter is an optional integer that specifies the number of
7202        threads to use for parallel processing. If no value is provided, it will default to the number
7203        of threads obtained from the `get_threads()` method
7204        :type threads: int
7205        """
7206
7207        # Function for each partition of the Dask Dataframe
7208        def partition_function(partition):
7209            """
7210            The function `partition_function` applies the `annotation_hgvs_partition` function to
7211            each row of a DataFrame called `partition`.
7212
7213            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
7214            to be processed
7215            :return: the result of applying the "annotation_hgvs_partition" function to each row of
7216            the "partition" dataframe along the axis 1.
7217            """
7218            return partition.apply(annotation_hgvs_partition, axis=1)
7219
7220        def annotation_hgvs_partition(row) -> str:
7221            """
7222            The function `annotation_hgvs_partition` takes in a row of data and returns a string
7223            containing a list of HGVS names associated with the given genomic coordinates and alleles.
7224
7225            :param row: A dictionary-like object that contains the values for the following keys:
7226            :return: a string that contains the HGVS names associated with the given row of data.
7227            """
7228
7229            chr = row["CHROM"]
7230            pos = row["POS"]
7231            ref = row["REF"]
7232            alt = row["ALT"]
7233
7234            # Find list of associated transcripts
7235            transcripts_list = list(
7236                polars_conn.execute(
7237                    f"""
7238                SELECT transcript
7239                FROM refseq_df
7240                WHERE CHROM='{chr}'
7241                AND POS={pos}
7242            """
7243                )["transcript"]
7244            )
7245
7246            # Full HGVS annotation in list
7247            hgvs_full_list = []
7248
7249            for transcript_name in transcripts_list:
7250
7251                # Transcript
7252                transcript = get_transcript(
7253                    transcripts=transcripts, transcript_name=transcript_name
7254                )
7255                # Exon
7256                if use_exon:
7257                    exon = transcript.find_exon_number(pos)
7258                else:
7259                    exon = None
7260                # Protein
7261                transcript_protein = None
7262                if use_protein or add_protein or full_format:
7263                    transcripts_protein = list(
7264                        polars_conn.execute(
7265                            f"""
7266                        SELECT protein
7267                        FROM refseqlink_df
7268                        WHERE transcript='{transcript_name}'
7269                        LIMIT 1
7270                    """
7271                        )["protein"]
7272                    )
7273                    if len(transcripts_protein):
7274                        transcript_protein = transcripts_protein[0]
7275
7276                # HGVS name
7277                hgvs_name = format_hgvs_name(
7278                    chr,
7279                    pos,
7280                    ref,
7281                    alt,
7282                    genome=genome,
7283                    transcript=transcript,
7284                    transcript_protein=transcript_protein,
7285                    exon=exon,
7286                    use_gene=use_gene,
7287                    use_protein=use_protein,
7288                    full_format=full_format,
7289                    use_version=use_version,
7290                    codon_type=codon_type,
7291                )
7292                hgvs_full_list.append(hgvs_name)
7293                if add_protein and not use_protein and not full_format:
7294                    hgvs_name = format_hgvs_name(
7295                        chr,
7296                        pos,
7297                        ref,
7298                        alt,
7299                        genome=genome,
7300                        transcript=transcript,
7301                        transcript_protein=transcript_protein,
7302                        exon=exon,
7303                        use_gene=use_gene,
7304                        use_protein=True,
7305                        full_format=False,
7306                        use_version=use_version,
7307                        codon_type=codon_type,
7308                    )
7309                    hgvs_full_list.append(hgvs_name)
7310
7311            # Create liste of HGVS annotations
7312            hgvs_full = ",".join(hgvs_full_list)
7313
7314            return hgvs_full
7315
7316        # Polars connexion
7317        polars_conn = pl.SQLContext(register_globals=True, eager=True)
7318
7319        # Config
7320        config = self.get_config()
7321
7322        # Databases
7323        # Genome
7324        databases_genomes_folders = (
7325            config.get("folders", {})
7326            .get("databases", {})
7327            .get("genomes", DEFAULT_GENOME_FOLDER)
7328        )
7329        databases_genome = (
7330            config.get("folders", {}).get("databases", {}).get("genomes", "")
7331        )
7332        # refseq database folder
7333        databases_refseq_folders = (
7334            config.get("folders", {})
7335            .get("databases", {})
7336            .get("refseq", DEFAULT_REFSEQ_FOLDER)
7337        )
7338        # refseq
7339        databases_refseq = config.get("databases", {}).get("refSeq", None)
7340        # refSeqLink
7341        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
7342
7343        # Param
7344        param = self.get_param()
7345
7346        # Quick HGVS
7347        if "hgvs_options" in param and param.get("hgvs_options", ""):
7348            log.info(f"Quick HGVS Annotation:")
7349            if not param.get("hgvs", None):
7350                param["hgvs"] = {}
7351            for option in param.get("hgvs_options", "").split(","):
7352                option_var_val = option.split("=")
7353                option_var = option_var_val[0]
7354                if len(option_var_val) > 1:
7355                    option_val = option_var_val[1]
7356                else:
7357                    option_val = "True"
7358                if option_val.upper() in ["TRUE"]:
7359                    option_val = True
7360                elif option_val.upper() in ["FALSE"]:
7361                    option_val = False
7362                log.info(f"   {option_var}={option_val}")
7363                param["hgvs"][option_var] = option_val
7364
7365        # Check if HGVS annotation enabled
7366        if "hgvs" in param:
7367            log.info(f"HGVS Annotation... ")
7368            for hgvs_option in param.get("hgvs", {}):
7369                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
7370        else:
7371            return
7372
7373        # HGVS Param
7374        param_hgvs = param.get("hgvs", {})
7375        use_exon = param_hgvs.get("use_exon", False)
7376        use_gene = param_hgvs.get("use_gene", False)
7377        use_protein = param_hgvs.get("use_protein", False)
7378        add_protein = param_hgvs.get("add_protein", False)
7379        full_format = param_hgvs.get("full_format", False)
7380        use_version = param_hgvs.get("use_version", False)
7381        codon_type = param_hgvs.get("codon_type", "3")
7382
7383        # refSseq refSeqLink
7384        databases_refseq = param_hgvs.get("refseq", databases_refseq)
7385        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
7386
7387        # Assembly
7388        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
7389
7390        # Genome
7391        genome_file = None
7392        if find_genome(databases_genome):
7393            genome_file = find_genome(databases_genome)
7394        else:
7395            genome_file = find_genome(
7396                genome_path=databases_genomes_folders, assembly=assembly
7397            )
7398        log.debug("Genome: " + str(genome_file))
7399
7400        # refSseq
7401        refseq_file = find_file_prefix(
7402            input_file=databases_refseq,
7403            prefix="ncbiRefSeq",
7404            folder=databases_refseq_folders,
7405            assembly=assembly,
7406        )
7407        log.debug("refSeq: " + str(refseq_file))
7408
7409        # refSeqLink
7410        refseqlink_file = find_file_prefix(
7411            input_file=databases_refseqlink,
7412            prefix="ncbiRefSeqLink",
7413            folder=databases_refseq_folders,
7414            assembly=assembly,
7415        )
7416        log.debug("refSeqLink: " + str(refseqlink_file))
7417
7418        # Threads
7419        if not threads:
7420            threads = self.get_threads()
7421        log.debug("Threads: " + str(threads))
7422
7423        # Variables
7424        table_variants = self.get_table_variants(clause="update")
7425
7426        # Get variants SNV and InDel only
7427        query_variants = f"""
7428            SELECT "#CHROM" AS CHROM, POS, REF, ALT
7429            FROM {table_variants}
7430            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
7431            """
7432        df_variants = self.get_query_to_df(query_variants)
7433
7434        # Added columns
7435        added_columns = []
7436
7437        # Add hgvs column in variants table
7438        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
7439        added_column = self.add_column(
7440            table_variants, hgvs_column_name, "STRING", default_value=None
7441        )
7442        added_columns.append(added_column)
7443
7444        log.debug(f"refSeq loading...")
7445        # refSeq in duckDB
7446        refseq_table = get_refseq_table(
7447            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
7448        )
7449        # Loading all refSeq in Dataframe
7450        refseq_query = f"""
7451            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
7452            FROM {refseq_table}
7453            JOIN df_variants ON (
7454                {refseq_table}.chrom = df_variants.CHROM
7455                AND {refseq_table}.txStart<=df_variants.POS
7456                AND {refseq_table}.txEnd>=df_variants.POS
7457            )
7458        """
7459        refseq_df = self.conn.query(refseq_query).pl()
7460
7461        if refseqlink_file:
7462            log.debug(f"refSeqLink loading...")
7463            # refSeqLink in duckDB
7464            refseqlink_table = get_refseq_table(
7465                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
7466            )
7467            # Loading all refSeqLink in Dataframe
7468            protacc_column = "protAcc_with_ver"
7469            mrnaacc_column = "mrnaAcc_with_ver"
7470            refseqlink_query = f"""
7471                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
7472                FROM {refseqlink_table} 
7473                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
7474                WHERE protAcc_without_ver IS NOT NULL
7475            """
7476            # Polars Dataframe
7477            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
7478
7479        # Read RefSeq transcripts into a python dict/model.
7480        log.debug(f"Transcripts loading...")
7481        with tempfile.TemporaryDirectory() as tmpdir:
7482            transcripts_query = f"""
7483                COPY (
7484                    SELECT {refseq_table}.*
7485                    FROM {refseq_table}
7486                    JOIN df_variants ON (
7487                        {refseq_table}.chrom=df_variants.CHROM
7488                        AND {refseq_table}.txStart<=df_variants.POS
7489                        AND {refseq_table}.txEnd>=df_variants.POS
7490                    )
7491                )
7492                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
7493            """
7494            self.conn.query(transcripts_query)
7495            with open(f"{tmpdir}/transcript.tsv") as infile:
7496                transcripts = read_transcripts(infile)
7497
7498        # Polars connexion
7499        polars_conn = pl.SQLContext(register_globals=True, eager=True)
7500
7501        log.debug("Genome loading...")
7502        # Read genome sequence using pyfaidx.
7503        genome = Fasta(genome_file)
7504
7505        log.debug("Start annotation HGVS...")
7506
7507        # Create
7508        # a Dask Dataframe from Pandas dataframe with partition as number of threads
7509        ddf = dd.from_pandas(df_variants, npartitions=threads)
7510
7511        # Use dask.dataframe.apply() to apply function on each partition
7512        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
7513
7514        # Convert Dask DataFrame to Pandas Dataframe
7515        df = ddf.compute()
7516
7517        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
7518        with tempfile.TemporaryDirectory() as tmpdir:
7519            df_parquet = os.path.join(tmpdir, "df.parquet")
7520            df.to_parquet(df_parquet)
7521
7522            # Update hgvs column
7523            update_variant_query = f"""
7524                UPDATE {table_variants}
7525                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
7526                FROM read_parquet('{df_parquet}') as df
7527                WHERE variants."#CHROM" = df.CHROM
7528                AND variants.POS = df.POS
7529                AND variants.REF = df.REF
7530                AND variants.ALT = df.ALT
7531                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
7532                """
7533            self.execute_query(update_variant_query)
7534
7535        # Update INFO column
7536        sql_query_update = f"""
7537            UPDATE {table_variants}
7538            SET INFO = 
7539                concat(
7540                    CASE 
7541                        WHEN INFO NOT IN ('','.')
7542                        THEN concat(INFO, ';')
7543                        ELSE ''
7544                    END,
7545                    'hgvs=',
7546                    {hgvs_column_name}
7547                )
7548            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
7549            """
7550        self.execute_query(sql_query_update)
7551
7552        # Add header
7553        HGVS_INFOS = {
7554            "hgvs": {
7555                "ID": "hgvs",
7556                "Number": ".",
7557                "Type": "String",
7558                "Description": f"HGVS annotatation with HOWARD",
7559            }
7560        }
7561
7562        for field in HGVS_INFOS:
7563            field_ID = HGVS_INFOS[field]["ID"]
7564            field_description = HGVS_INFOS[field]["Description"]
7565            self.get_header().infos[field_ID] = vcf.parser._Info(
7566                field_ID,
7567                HGVS_INFOS[field]["Number"],
7568                HGVS_INFOS[field]["Type"],
7569                field_description,
7570                "unknown",
7571                "unknown",
7572                code_type_map[HGVS_INFOS[field]["Type"]],
7573            )
7574
7575        # Remove added columns
7576        for added_column in added_columns:
7577            self.drop_column(column=added_column)

The annotation_hgvs function performs HGVS annotation on a set of variants using genomic coordinates and alleles.

Parameters
  • threads: The threads parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the get_threads() method
def get_operations_help( self, operations_config_dict: dict = {}, operations_config_file: str = None) -> list:
7583    def get_operations_help(
7584        self, operations_config_dict: dict = {}, operations_config_file: str = None
7585    ) -> list:
7586
7587        # Init
7588        operations_help = []
7589
7590        # operations
7591        operations = self.get_config_json(
7592            name="calculations",
7593            config_dict=operations_config_dict,
7594            config_file=operations_config_file,
7595        )
7596        for op in operations:
7597            op_name = operations[op].get("name", op).upper()
7598            op_description = operations[op].get("description", op_name)
7599            op_available = operations[op].get("available", False)
7600            if op_available:
7601                operations_help.append(f"   {op_name}: {op_description}")
7602
7603        # Sort operations
7604        operations_help.sort()
7605
7606        # insert header
7607        operations_help.insert(0, "Available calculation operations:")
7608
7609        # Return
7610        return operations_help
def calculation( self, operations: dict = {}, operations_config_dict: dict = {}, operations_config_file: str = None) -> None:
7612    def calculation(
7613        self,
7614        operations: dict = {},
7615        operations_config_dict: dict = {},
7616        operations_config_file: str = None,
7617    ) -> None:
7618        """
7619        It takes a list of operations, and for each operation, it checks if it's a python or sql
7620        operation, and then calls the appropriate function
7621
7622        param json example:
7623            "calculation": {
7624                "NOMEN": {
7625                    "options": {
7626                        "hgvs_field": "hgvs"
7627                    },
7628                "middle" : null
7629            }
7630        """
7631
7632        # Param
7633        param = self.get_param()
7634
7635        # operations config
7636        operations_config = self.get_config_json(
7637            name="calculations",
7638            config_dict=operations_config_dict,
7639            config_file=operations_config_file,
7640        )
7641
7642        # Upper keys
7643        operations_config = {k.upper(): v for k, v in operations_config.items()}
7644
7645        # Calculations
7646
7647        # Operations from param
7648        operations = param.get("calculation", {}).get("calculations", operations)
7649
7650        # Quick calculation - add
7651        if param.get("calculations", None):
7652            calculations_list = [
7653                value for value in param.get("calculations", "").split(",")
7654            ]
7655            log.info(f"Quick Calculations:")
7656            for calculation_key in calculations_list:
7657                log.info(f"   {calculation_key}")
7658            for calculation_operation in calculations_list:
7659                if calculation_operation.upper() not in operations:
7660                    operations[calculation_operation.upper()] = {}
7661                    add_value_into_dict(
7662                        dict_tree=param,
7663                        sections=[
7664                            "calculation",
7665                            "calculations",
7666                            calculation_operation.upper(),
7667                        ],
7668                        value={},
7669                    )
7670
7671        # Operations for calculation
7672        if not operations:
7673            operations = param.get("calculation", {}).get("calculations", {})
7674
7675        if operations:
7676            log.info(f"Calculations...")
7677
7678        # For each operations
7679        for operation_name in operations:
7680            operation_name = operation_name.upper()
7681            if operation_name not in [""]:
7682                if operation_name in operations_config:
7683                    log.info(f"Calculation '{operation_name}'")
7684                    operation = operations_config[operation_name]
7685                    operation_type = operation.get("type", "sql")
7686                    if operation_type == "python":
7687                        self.calculation_process_function(
7688                            operation=operation, operation_name=operation_name
7689                        )
7690                    elif operation_type == "sql":
7691                        self.calculation_process_sql(
7692                            operation=operation, operation_name=operation_name
7693                        )
7694                    else:
7695                        log.error(
7696                            f"Operations config: Type '{operation_type}' NOT available"
7697                        )
7698                        raise ValueError(
7699                            f"Operations config: Type '{operation_type}' NOT available"
7700                        )
7701                else:
7702                    log.error(
7703                        f"Operations config: Calculation '{operation_name}' NOT available"
7704                    )
7705                    raise ValueError(
7706                        f"Operations config: Calculation '{operation_name}' NOT available"
7707                    )
7708
7709        # Explode INFOS fields into table fields
7710        if self.get_explode_infos():
7711            self.explode_infos(
7712                prefix=self.get_explode_infos_prefix(),
7713                fields=self.get_explode_infos_fields(),
7714                force=True,
7715            )

It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function

param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle": null } }

def calculation_process_sql(self, operation: dict, operation_name: str = 'unknown') -> None:
7717    def calculation_process_sql(
7718        self, operation: dict, operation_name: str = "unknown"
7719    ) -> None:
7720        """
7721        The `calculation_process_sql` function takes in a mathematical operation as a string and
7722        performs the operation, updating the specified table with the result.
7723
7724        :param operation: The `operation` parameter is a dictionary that contains information about the
7725        mathematical operation to be performed. It includes the following keys:
7726        :type operation: dict
7727        :param operation_name: The `operation_name` parameter is a string that represents the name of
7728        the mathematical operation being performed. It is used for logging and error handling purposes,
7729        defaults to unknown
7730        :type operation_name: str (optional)
7731        """
7732
7733        # table variants
7734        table_variants = self.get_table_variants(clause="alter")
7735
7736        # Operation infos
7737        operation_name = operation.get("name", "unknown")
7738        log.debug(f"process sql {operation_name}")
7739        output_column_name = operation.get("output_column_name", operation_name)
7740        output_column_type = operation.get("output_column_type", "String")
7741        prefix = operation.get("explode_infos_prefix", "")
7742        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
7743        output_column_description = operation.get(
7744            "output_column_description", f"{operation_name} operation"
7745        )
7746        operation_query = operation.get("operation_query", None)
7747        if isinstance(operation_query, list):
7748            operation_query = " ".join(operation_query)
7749        operation_info_fields = operation.get("info_fields", [])
7750        operation_info_fields_check = operation.get("info_fields_check", False)
7751        operation_info = operation.get("operation_info", True)
7752
7753        if operation_query:
7754
7755            # Info fields check
7756            operation_info_fields_check_result = True
7757            if operation_info_fields_check:
7758                header_infos = self.get_header().infos
7759                for info_field in operation_info_fields:
7760                    operation_info_fields_check_result = (
7761                        operation_info_fields_check_result
7762                        and info_field in header_infos
7763                    )
7764
7765            # If info fields available
7766            if operation_info_fields_check_result:
7767
7768                # Added_columns
7769                added_columns = []
7770
7771                # Create VCF header field
7772                vcf_reader = self.get_header()
7773                vcf_reader.infos[output_column_name] = vcf.parser._Info(
7774                    output_column_name,
7775                    ".",
7776                    output_column_type,
7777                    output_column_description,
7778                    "howard calculation",
7779                    "0",
7780                    self.code_type_map.get(output_column_type),
7781                )
7782
7783                # Explode infos if needed
7784                log.debug(f"calculation_process_sql prefix {prefix}")
7785                added_columns += self.explode_infos(
7786                    prefix=prefix,
7787                    fields=[output_column_name] + operation_info_fields,
7788                    force=True,
7789                )
7790
7791                # Create column
7792                added_column = self.add_column(
7793                    table_name=table_variants,
7794                    column_name=prefix + output_column_name,
7795                    column_type=output_column_type_sql,
7796                    default_value="null",
7797                )
7798                added_columns.append(added_column)
7799
7800                # Operation calculation
7801                try:
7802
7803                    # Query to update calculation column
7804                    sql_update = f"""
7805                        UPDATE {table_variants}
7806                        SET "{prefix}{output_column_name}" = ({operation_query})
7807                    """
7808                    self.conn.execute(sql_update)
7809
7810                    # Add to INFO
7811                    if operation_info:
7812                        sql_update_info = f"""
7813                            UPDATE {table_variants}
7814                            SET "INFO" =
7815                                concat(
7816                                    CASE
7817                                        WHEN "INFO" IS NOT NULL
7818                                        THEN concat("INFO", ';')
7819                                        ELSE ''
7820                                    END,
7821                                    '{output_column_name}=',
7822                                    "{prefix}{output_column_name}"
7823                                )
7824                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
7825                        """
7826                        self.conn.execute(sql_update_info)
7827
7828                except:
7829                    log.error(
7830                        f"Operations config: Calculation '{operation_name}' query failed"
7831                    )
7832                    raise ValueError(
7833                        f"Operations config: Calculation '{operation_name}' query failed"
7834                    )
7835
7836                # Remove added columns
7837                for added_column in added_columns:
7838                    log.debug(f"added_column: {added_column}")
7839                    self.drop_column(column=added_column)
7840
7841            else:
7842                log.error(
7843                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7844                )
7845                raise ValueError(
7846                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7847                )
7848
7849        else:
7850            log.error(
7851                f"Operations config: Calculation '{operation_name}' query NOT defined"
7852            )
7853            raise ValueError(
7854                f"Operations config: Calculation '{operation_name}' query NOT defined"
7855            )

The calculation_process_sql function takes in a mathematical operation as a string and performs the operation, updating the specified table with the result.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes, defaults to unknown
def calculation_process_function(self, operation: dict, operation_name: str = 'unknown') -> None:
7857    def calculation_process_function(
7858        self, operation: dict, operation_name: str = "unknown"
7859    ) -> None:
7860        """
7861        The `calculation_process_function` takes in an operation dictionary and performs the specified
7862        function with the given parameters.
7863
7864        :param operation: The `operation` parameter is a dictionary that contains information about the
7865        operation to be performed. It has the following keys:
7866        :type operation: dict
7867        :param operation_name: The `operation_name` parameter is a string that represents the name of
7868        the operation being performed. It is used for logging purposes, defaults to unknown
7869        :type operation_name: str (optional)
7870        """
7871
7872        operation_name = operation["name"]
7873        log.debug(f"process sql {operation_name}")
7874        function_name = operation["function_name"]
7875        function_params = operation["function_params"]
7876        getattr(self, function_name)(*function_params)

The calculation_process_function takes in an operation dictionary and performs the specified function with the given parameters.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the operation being performed. It is used for logging purposes, defaults to unknown
def calculation_variant_id(self) -> None:
7878    def calculation_variant_id(self) -> None:
7879        """
7880        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
7881        updates the INFO field of a variants table with the variant ID.
7882        """
7883
7884        # variant_id annotation field
7885        variant_id_tag = self.get_variant_id_column()
7886        added_columns = [variant_id_tag]
7887
7888        # variant_id hgvs tags"
7889        vcf_infos_tags = {
7890            variant_id_tag: "howard variant ID annotation",
7891        }
7892
7893        # Variants table
7894        table_variants = self.get_table_variants()
7895
7896        # Header
7897        vcf_reader = self.get_header()
7898
7899        # Add variant_id to header
7900        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
7901            variant_id_tag,
7902            ".",
7903            "String",
7904            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
7905            "howard calculation",
7906            "0",
7907            self.code_type_map.get("String"),
7908        )
7909
7910        # Update
7911        sql_update = f"""
7912            UPDATE {table_variants}
7913            SET "INFO" = 
7914                concat(
7915                    CASE
7916                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
7917                        THEN ''
7918                        ELSE concat("INFO", ';')
7919                    END,
7920                    '{variant_id_tag}=',
7921                    "{variant_id_tag}"
7922                )
7923        """
7924        self.conn.execute(sql_update)
7925
7926        # Remove added columns
7927        for added_column in added_columns:
7928            self.drop_column(column=added_column)

The function calculation_variant_id adds a variant ID annotation to a VCF file header and updates the INFO field of a variants table with the variant ID.

def calculation_extract_snpeff_hgvs( self, snpeff_hgvs: str = 'snpeff_hgvs', snpeff_field: str = 'ANN') -> None:
7930    def calculation_extract_snpeff_hgvs(
7931        self,
7932        snpeff_hgvs: str = "snpeff_hgvs",
7933        snpeff_field: str = "ANN",
7934    ) -> None:
7935        """
7936        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
7937        annotation field in a VCF file and adds them as a new column in the variants table.
7938
7939        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
7940        function is used to specify the name of the column that will store the HGVS nomenclatures
7941        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
7942        snpeff_hgvs
7943        :type snpeff_hgvs: str (optional)
7944        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
7945        function represents the field in the VCF file that contains SnpEff annotations. This field is
7946        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
7947        to ANN
7948        :type snpeff_field: str (optional)
7949        """
7950
7951        # Snpeff hgvs tags
7952        vcf_infos_tags = {
7953            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
7954        }
7955
7956        # Prefix
7957        prefix = self.get_explode_infos_prefix()
7958        if prefix:
7959            prefix = "INFO/"
7960
7961        # snpEff fields
7962        speff_ann_infos = prefix + snpeff_field
7963        speff_hgvs_infos = prefix + snpeff_hgvs
7964
7965        # Variants table
7966        table_variants = self.get_table_variants()
7967
7968        # Header
7969        vcf_reader = self.get_header()
7970
7971        # Add columns
7972        added_columns = []
7973
7974        # Explode HGVS field in column
7975        added_columns += self.explode_infos(fields=[snpeff_field])
7976
7977        if snpeff_field in vcf_reader.infos:
7978
7979            log.debug(vcf_reader.infos[snpeff_field])
7980
7981            # Extract ANN header
7982            ann_description = vcf_reader.infos[snpeff_field].desc
7983            pattern = r"'(.+?)'"
7984            match = re.search(pattern, ann_description)
7985            if match:
7986                ann_header_match = match.group(1).split(" | ")
7987                ann_header_desc = {}
7988                for i in range(len(ann_header_match)):
7989                    ann_header_info = "".join(
7990                        char for char in ann_header_match[i] if char.isalnum()
7991                    )
7992                    ann_header_desc[ann_header_info] = ann_header_match[i]
7993                if not ann_header_desc:
7994                    raise ValueError("Invalid header description format")
7995            else:
7996                raise ValueError("Invalid header description format")
7997
7998            # Create variant id
7999            variant_id_column = self.get_variant_id_column()
8000            added_columns += [variant_id_column]
8001
8002            # Create dataframe
8003            dataframe_snpeff_hgvs = self.get_query_to_df(
8004                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8005            )
8006
8007            # Create main NOMEN column
8008            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8009                speff_ann_infos
8010            ].apply(
8011                lambda x: extract_snpeff_hgvs(
8012                    str(x), header=list(ann_header_desc.values())
8013                )
8014            )
8015
8016            # Add snpeff_hgvs to header
8017            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
8018                snpeff_hgvs,
8019                ".",
8020                "String",
8021                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
8022                "howard calculation",
8023                "0",
8024                self.code_type_map.get("String"),
8025            )
8026
8027            # Update
8028            sql_update = f"""
8029                UPDATE variants
8030                SET "INFO" = 
8031                    concat(
8032                        CASE
8033                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8034                            THEN ''
8035                            ELSE concat("INFO", ';')
8036                        END,
8037                        CASE 
8038                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8039                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8040                            THEN concat(
8041                                    '{snpeff_hgvs}=',
8042                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8043                                )
8044                            ELSE ''
8045                        END
8046                    )
8047                FROM dataframe_snpeff_hgvs
8048                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8049
8050            """
8051            self.conn.execute(sql_update)
8052
8053            # Delete dataframe
8054            del dataframe_snpeff_hgvs
8055            gc.collect()
8056
8057        else:
8058
8059            log.warning(
8060                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8061            )
8062
8063        # Remove added columns
8064        for added_column in added_columns:
8065            self.drop_column(column=added_column)

The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff annotation field in a VCF file and adds them as a new column in the variants table.

Parameters
  • snpeff_hgvs: The snpeff_hgvs parameter in the calculation_extract_snpeff_hgvs function is used to specify the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to snpeff_hgvs
  • snpeff_field: The snpeff_field parameter in the calculation_extract_snpeff_hgvs function represents the field in the VCF file that contains SnpEff annotations. This field is used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults to ANN
def calculation_snpeff_ann_explode( self, uniquify: bool = True, output_format: str = 'fields', output_prefix: str = 'snpeff_', snpeff_field: str = 'ANN') -> None:
8067    def calculation_snpeff_ann_explode(
8068        self,
8069        uniquify: bool = True,
8070        output_format: str = "fields",
8071        output_prefix: str = "snpeff_",
8072        snpeff_field: str = "ANN",
8073    ) -> None:
8074        """
8075        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
8076        exploding the HGVS field and updating variant information accordingly.
8077
8078        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
8079        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
8080        it indicates that the output should be unique, meaning that duplicate entries should be removed,
8081        defaults to True
8082        :type uniquify: bool (optional)
8083        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
8084        function specifies the format in which the output annotations will be generated. It has a
8085        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
8086        format, defaults to fields
8087        :type output_format: str (optional)
8088        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
8089        method is used to specify the prefix that will be added to the output annotations generated
8090        during the calculation process. This prefix helps to differentiate the newly added annotations
8091        from existing ones in the output data. By default, the, defaults to ANN_
8092        :type output_prefix: str (optional)
8093        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
8094        function is used to specify the field in the VCF file that contains SnpEff annotations. This
8095        field will be processed to explode the HGVS annotations and update the variant information
8096        accordingly, defaults to ANN
8097        :type snpeff_field: str (optional)
8098        """
8099
8100        # SnpEff annotation field
8101        snpeff_hgvs = "snpeff_ann_explode"
8102
8103        # Snpeff hgvs tags
8104        vcf_infos_tags = {
8105            snpeff_hgvs: "Explode snpEff annotations",
8106        }
8107
8108        # Prefix
8109        prefix = self.get_explode_infos_prefix()
8110        if prefix:
8111            prefix = "INFO/"
8112
8113        # snpEff fields
8114        speff_ann_infos = prefix + snpeff_field
8115        speff_hgvs_infos = prefix + snpeff_hgvs
8116
8117        # Variants table
8118        table_variants = self.get_table_variants()
8119
8120        # Header
8121        vcf_reader = self.get_header()
8122
8123        # Add columns
8124        added_columns = []
8125
8126        # Explode HGVS field in column
8127        added_columns += self.explode_infos(fields=[snpeff_field])
8128        log.debug(f"snpeff_field={snpeff_field}")
8129        log.debug(f"added_columns={added_columns}")
8130
8131        if snpeff_field in vcf_reader.infos:
8132
8133            # Extract ANN header
8134            ann_description = vcf_reader.infos[snpeff_field].desc
8135            pattern = r"'(.+?)'"
8136            match = re.search(pattern, ann_description)
8137            if match:
8138                ann_header_match = match.group(1).split(" | ")
8139                ann_header = []
8140                ann_header_desc = {}
8141                for i in range(len(ann_header_match)):
8142                    ann_header_info = "".join(
8143                        char for char in ann_header_match[i] if char.isalnum()
8144                    )
8145                    ann_header.append(ann_header_info)
8146                    ann_header_desc[ann_header_info] = ann_header_match[i]
8147                if not ann_header_desc:
8148                    raise ValueError("Invalid header description format")
8149            else:
8150                raise ValueError("Invalid header description format")
8151
8152            # Create variant id
8153            variant_id_column = self.get_variant_id_column()
8154            added_columns += [variant_id_column]
8155
8156            # Create dataframe
8157            dataframe_snpeff_hgvs = self.get_query_to_df(
8158                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8159            )
8160
8161            # Create snpEff columns
8162            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8163                speff_ann_infos
8164            ].apply(
8165                lambda x: explode_snpeff_ann(
8166                    str(x),
8167                    uniquify=uniquify,
8168                    output_format=output_format,
8169                    prefix=output_prefix,
8170                    header=list(ann_header_desc.values()),
8171                )
8172            )
8173
8174            # Header
8175            ann_annotations_prefix = ""
8176            if output_format.upper() in ["JSON"]:
8177                ann_annotations_prefix = f"{output_prefix}="
8178                vcf_reader.infos[output_prefix] = vcf.parser._Info(
8179                    output_prefix,
8180                    ".",
8181                    "String",
8182                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8183                    + " - JSON format",
8184                    "howard calculation",
8185                    "0",
8186                    self.code_type_map.get("String"),
8187                )
8188            else:
8189                for ann_annotation in ann_header:
8190                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
8191                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
8192                        ann_annotation_id,
8193                        ".",
8194                        "String",
8195                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8196                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
8197                        "howard calculation",
8198                        "0",
8199                        self.code_type_map.get("String"),
8200                    )
8201
8202            # Update
8203            sql_update = f"""
8204                UPDATE variants
8205                SET "INFO" = 
8206                    concat(
8207                        CASE
8208                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8209                            THEN ''
8210                            ELSE concat("INFO", ';')
8211                        END,
8212                        CASE 
8213                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8214                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8215                            THEN concat(
8216                                '{ann_annotations_prefix}',
8217                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8218                                )
8219                            ELSE ''
8220                        END
8221                    )
8222                FROM dataframe_snpeff_hgvs
8223                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8224
8225            """
8226            self.conn.execute(sql_update)
8227
8228            # Delete dataframe
8229            del dataframe_snpeff_hgvs
8230            gc.collect()
8231
8232        else:
8233
8234            log.warning(
8235                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8236            )
8237
8238        # Remove added columns
8239        for added_column in added_columns:
8240            self.drop_column(column=added_column)

The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by exploding the HGVS field and updating variant information accordingly.

Parameters
  • uniquify: The uniquify parameter in the calculation_snpeff_ann_explode method is a boolean flag that determines whether the output should be uniquified or not. When set to True, it indicates that the output should be unique, meaning that duplicate entries should be removed, defaults to True
  • output_format: The output_format parameter in the calculation_snpeff_ann_explode function specifies the format in which the output annotations will be generated. It has a default value of "fields". You can also set it to "JSON" to output the annotations in JSON format, defaults to fields
  • output_prefix: The output_prefix parameter in the calculation_snpeff_ann_explode method is used to specify the prefix that will be added to the output annotations generated during the calculation process. This prefix helps to differentiate the newly added annotations from existing ones in the output data, defaults to snpeff_
  • snpeff_field: The snpeff_field parameter in the calculation_snpeff_ann_explode function is used to specify the field in the VCF file that contains SnpEff annotations. This field will be processed to explode the HGVS annotations and update the variant information accordingly, defaults to ANN
def calculation_extract_nomen(self) -> None:
8242    def calculation_extract_nomen(self) -> None:
8243        """
8244        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
8245        """
8246
8247        # NOMEN field
8248        field_nomen_dict = "NOMEN_DICT"
8249
8250        # NOMEN structure
8251        nomen_dict = {
8252            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
8253            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
8254            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
8255            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
8256            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
8257            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
8258            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
8259            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
8260            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
8261            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
8262        }
8263
8264        # Param
8265        param = self.get_param()
8266
8267        # Prefix
8268        prefix = self.get_explode_infos_prefix()
8269
8270        # Header
8271        vcf_reader = self.get_header()
8272
8273        # Get HGVS field
8274        hgvs_field = (
8275            param.get("calculation", {})
8276            .get("calculations", {})
8277            .get("NOMEN", {})
8278            .get("options", {})
8279            .get("hgvs_field", "hgvs")
8280        )
8281
8282        # Get transcripts
8283        transcripts_file = (
8284            param.get("calculation", {})
8285            .get("calculations", {})
8286            .get("NOMEN", {})
8287            .get("options", {})
8288            .get("transcripts", None)
8289        )
8290        transcripts_file = full_path(transcripts_file)
8291        transcripts = []
8292        if transcripts_file:
8293            if os.path.exists(transcripts_file):
8294                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
8295                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
8296            else:
8297                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
8298                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")
8299
8300        # Added columns
8301        added_columns = []
8302
8303        # Explode HGVS field in column
8304        added_columns += self.explode_infos(fields=[hgvs_field])
8305
8306        # extra infos
8307        extra_infos = self.get_extra_infos()
8308        extra_field = prefix + hgvs_field
8309
8310        if extra_field in extra_infos:
8311
8312            # Create dataframe
8313            dataframe_hgvs = self.get_query_to_df(
8314                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
8315            )
8316
8317            # Create main NOMEN column
8318            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
8319                lambda x: find_nomen(str(x), transcripts=transcripts)
8320            )
8321
8322            # Explode NOMEN Structure and create SQL set for update
8323            sql_nomen_fields = []
8324            for nomen_field in nomen_dict:
8325
8326                # Explode each field into a column
8327                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
8328                    lambda x: dict(x).get(nomen_field, "")
8329                )
8330
8331                # Create VCF header field
8332                vcf_reader.infos[nomen_field] = vcf.parser._Info(
8333                    nomen_field,
8334                    ".",
8335                    "String",
8336                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
8337                    "howard calculation",
8338                    "0",
8339                    self.code_type_map.get("String"),
8340                )
8341                sql_nomen_fields.append(
8342                    f"""
8343                        CASE 
8344                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
8345                            THEN concat(
8346                                    ';{nomen_field}=',
8347                                    dataframe_hgvs."{nomen_field}"
8348                                )
8349                            ELSE ''
8350                        END
8351                    """
8352                )
8353
8354            # SQL set for update
8355            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
8356
8357            # Update
8358            sql_update = f"""
8359                UPDATE variants
8360                SET "INFO" = 
8361                    concat(
8362                        CASE
8363                            WHEN "INFO" IS NULL
8364                            THEN ''
8365                            ELSE "INFO"
8366                        END,
8367                        {sql_nomen_fields_set}
8368                    )
8369                FROM dataframe_hgvs
8370                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
8371                    AND variants."POS" = dataframe_hgvs."POS" 
8372                    AND variants."REF" = dataframe_hgvs."REF"
8373                    AND variants."ALT" = dataframe_hgvs."ALT"
8374            """
8375            self.conn.execute(sql_update)
8376
8377            # Delete dataframe
8378            del dataframe_hgvs
8379            gc.collect()
8380
8381        # Remove added columns
8382        for added_column in added_columns:
8383            self.drop_column(column=added_column)

This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

def calculation_find_by_pipeline(self, tag: str = 'findbypipeline') -> None:
8385    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
8386        """
8387        The function `calculation_find_by_pipeline` performs a calculation to find the number of
8388        pipeline/sample for a variant and updates the variant information in a VCF file.
8389
8390        :param tag: The `tag` parameter is a string that represents the annotation field for the
8391        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
8392        VCF header and to update the corresponding field in the variants table, defaults to
8393        findbypipeline
8394        :type tag: str (optional)
8395        """
8396
8397        # if FORMAT and samples
8398        if (
8399            "FORMAT" in self.get_header_columns_as_list()
8400            and self.get_header_sample_list()
8401        ):
8402
8403            # findbypipeline annotation field
8404            findbypipeline_tag = tag
8405
8406            # VCF infos tags
8407            vcf_infos_tags = {
8408                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
8409            }
8410
8411            # Prefix
8412            prefix = self.get_explode_infos_prefix()
8413
8414            # Field
8415            findbypipeline_infos = prefix + findbypipeline_tag
8416
8417            # Variants table
8418            table_variants = self.get_table_variants()
8419
8420            # Header
8421            vcf_reader = self.get_header()
8422
8423            # Create variant id
8424            variant_id_column = self.get_variant_id_column()
8425            added_columns = [variant_id_column]
8426
8427            # variant_id, FORMAT and samples
8428            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8429                self.get_header_sample_list()
8430            )
8431
8432            # Create dataframe
8433            dataframe_findbypipeline = self.get_query_to_df(
8434                f""" SELECT {samples_fields} FROM {table_variants} """
8435            )
8436
8437            # Create findbypipeline column
8438            dataframe_findbypipeline[findbypipeline_infos] = (
8439                dataframe_findbypipeline.apply(
8440                    lambda row: findbypipeline(
8441                        row, samples=self.get_header_sample_list()
8442                    ),
8443                    axis=1,
8444                )
8445            )
8446
8447            # Add snpeff_hgvs to header
8448            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
8449                findbypipeline_tag,
8450                ".",
8451                "String",
8452                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
8453                "howard calculation",
8454                "0",
8455                self.code_type_map.get("String"),
8456            )
8457
8458            # Update
8459            sql_update = f"""
8460                UPDATE variants
8461                SET "INFO" = 
8462                    concat(
8463                        CASE
8464                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8465                            THEN ''
8466                            ELSE concat("INFO", ';')
8467                        END,
8468                        CASE 
8469                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
8470                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
8471                            THEN concat(
8472                                    '{findbypipeline_tag}=',
8473                                    dataframe_findbypipeline."{findbypipeline_infos}"
8474                                )
8475                            ELSE ''
8476                        END
8477                    )
8478                FROM dataframe_findbypipeline
8479                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
8480            """
8481            self.conn.execute(sql_update)
8482
8483            # Remove added columns
8484            for added_column in added_columns:
8485                self.drop_column(column=added_column)
8486
8487            # Delete dataframe
8488            del dataframe_findbypipeline
8489            gc.collect()

The function calculation_find_by_pipeline performs a calculation to find the number of pipeline/sample for a variant and updates the variant information in a VCF file.

Parameters
  • tag: The tag parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table, defaults to findbypipeline
def calculation_genotype_concordance(self) -> None:
8491    def calculation_genotype_concordance(self) -> None:
8492        """
8493        The function `calculation_genotype_concordance` calculates the genotype concordance for
8494        multi-caller VCF files and updates the variant information in the database.
8495        """
8496
8497        # if FORMAT and samples
8498        if (
8499            "FORMAT" in self.get_header_columns_as_list()
8500            and self.get_header_sample_list()
8501        ):
8502
8503            # genotypeconcordance annotation field
8504            genotypeconcordance_tag = "genotypeconcordance"
8505
8506            # VCF infos tags
8507            vcf_infos_tags = {
8508                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
8509            }
8510
8511            # Prefix
8512            prefix = self.get_explode_infos_prefix()
8513
8514            # Field
8515            genotypeconcordance_infos = prefix + genotypeconcordance_tag
8516
8517            # Variants table
8518            table_variants = self.get_table_variants()
8519
8520            # Header
8521            vcf_reader = self.get_header()
8522
8523            # Create variant id
8524            variant_id_column = self.get_variant_id_column()
8525            added_columns = [variant_id_column]
8526
8527            # variant_id, FORMAT and samples
8528            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8529                self.get_header_sample_list()
8530            )
8531
8532            # Create dataframe
8533            dataframe_genotypeconcordance = self.get_query_to_df(
8534                f""" SELECT {samples_fields} FROM {table_variants} """
8535            )
8536
8537            # Create genotypeconcordance column
8538            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
8539                dataframe_genotypeconcordance.apply(
8540                    lambda row: genotypeconcordance(
8541                        row, samples=self.get_header_sample_list()
8542                    ),
8543                    axis=1,
8544                )
8545            )
8546
8547            # Add genotypeconcordance to header
8548            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
8549                genotypeconcordance_tag,
8550                ".",
8551                "String",
8552                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
8553                "howard calculation",
8554                "0",
8555                self.code_type_map.get("String"),
8556            )
8557
8558            # Update
8559            sql_update = f"""
8560                UPDATE variants
8561                SET "INFO" = 
8562                    concat(
8563                        CASE
8564                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8565                            THEN ''
8566                            ELSE concat("INFO", ';')
8567                        END,
8568                        CASE
8569                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
8570                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
8571                            THEN concat(
8572                                    '{genotypeconcordance_tag}=',
8573                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
8574                                )
8575                            ELSE ''
8576                        END
8577                    )
8578                FROM dataframe_genotypeconcordance
8579                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
8580            """
8581            self.conn.execute(sql_update)
8582
8583            # Remove added columns
8584            for added_column in added_columns:
8585                self.drop_column(column=added_column)
8586
8587            # Delete dataframe
8588            del dataframe_genotypeconcordance
8589            gc.collect()

The function calculation_genotype_concordance calculates the genotype concordance for multi-caller VCF files and updates the variant information in the database.

def calculation_barcode(self, tag: str = 'barcode') -> None:
8591    def calculation_barcode(self, tag: str = "barcode") -> None:
8592        """
8593        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
8594        updates the INFO field in the file with the calculated barcode values.
8595
8596        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
8597        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
8598        the default tag name is set to "barcode", defaults to barcode
8599        :type tag: str (optional)
8600        """
8601
8602        # if FORMAT and samples
8603        if (
8604            "FORMAT" in self.get_header_columns_as_list()
8605            and self.get_header_sample_list()
8606        ):
8607
8608            # barcode annotation field
8609            if not tag:
8610                tag = "barcode"
8611
8612            # VCF infos tags
8613            vcf_infos_tags = {
8614                tag: "barcode calculation (VaRank)",
8615            }
8616
8617            # Prefix
8618            prefix = self.get_explode_infos_prefix()
8619
8620            # Field
8621            barcode_infos = prefix + tag
8622
8623            # Variants table
8624            table_variants = self.get_table_variants()
8625
8626            # Header
8627            vcf_reader = self.get_header()
8628
8629            # Create variant id
8630            variant_id_column = self.get_variant_id_column()
8631            added_columns = [variant_id_column]
8632
8633            # variant_id, FORMAT and samples
8634            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8635                self.get_header_sample_list()
8636            )
8637
8638            # Create dataframe
8639            dataframe_barcode = self.get_query_to_df(
8640                f""" SELECT {samples_fields} FROM {table_variants} """
8641            )
8642
8643            # Create barcode column
8644            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
8645                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
8646            )
8647
8648            # Add barcode to header
8649            vcf_reader.infos[tag] = vcf.parser._Info(
8650                tag,
8651                ".",
8652                "String",
8653                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
8654                "howard calculation",
8655                "0",
8656                self.code_type_map.get("String"),
8657            )
8658
8659            # Update
8660            sql_update = f"""
8661                UPDATE {table_variants}
8662                SET "INFO" = 
8663                    concat(
8664                        CASE
8665                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8666                            THEN ''
8667                            ELSE concat("INFO", ';')
8668                        END,
8669                        CASE
8670                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
8671                            AND dataframe_barcode."{barcode_infos}" NOT NULL
8672                            THEN concat(
8673                                    '{tag}=',
8674                                    dataframe_barcode."{barcode_infos}"
8675                                )
8676                            ELSE ''
8677                        END
8678                    )
8679                FROM dataframe_barcode
8680                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
8681            """
8682            self.conn.execute(sql_update)
8683
8684            # Remove added columns
8685            for added_column in added_columns:
8686                self.drop_column(column=added_column)
8687
8688            # Delete dataframe
8689            del dataframe_barcode
8690            gc.collect()

The calculation_barcode function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode", defaults to barcode
def calculation_barcode_family(self, tag: str = 'BCF') -> None:
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates a family barcode for variants in a VCF
        file and appends it (and the list of family samples) to the FORMAT column and to each
        per-sample genotype column.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not well formatted or resolves to no samples
        """

        # Only applicable when genotypes are present (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Fall back to the default tag name if an empty value was passed
            if not tag:
                tag = "BCF"

            # VCF FORMAT tag descriptions (the tag itself plus "<tag>S" listing the family samples)
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Pedigree from param: calculation.calculations.BARCODEFAMILY.family_pedigree
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED: accepted as a JSON file path, a JSON string,
            # a comma-separated sample list, or a dict
            if ped:

                # Pedigree is a file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string (JSON, or comma-separated sample names)
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as comma-separated sample names,
                        # mapping each sample name to itself
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # The family members are the pedigree values
                ped_samples = list(ped.values())

            else:
                # No pedigree provided: the whole sample list forms the family
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Name of the dataframe column holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Column uniquely identifying each variant (added temporarily, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and the family samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Fetch genotypes into a pandas dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode for each variant from the family samples
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare the barcode family tags in the VCF header (FORMAT section)
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per column (every sample plus FORMAT):
            #   - family samples get the barcode value and the family sample list
            #   - FORMAT gets the tag names
            #   - samples outside the family get missing values ('.')
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For './.' genotypes, expand to one '.' per FORMAT field before appending:
                # strip alphanumerics/whitespace from FORMAT, then turn each ':' into ':.'
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in a single UPDATE joined on the variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Free the dataframe memory
            del dataframe_barcode
            gc.collect()

The calculation_barcode_family function calculates a family barcode for variants in a VCF file and appends the barcode value and the list of family samples to the FORMAT and per-sample genotype fields.

Parameters
  • tag: The tag parameter in the calculation_barcode_family function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the tag parameter, the default value used is "BCF", defaults to BCF
def calculation_trio(self) -> None:
8882    def calculation_trio(self) -> None:
8883        """
8884        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
8885        information to the INFO field of each variant.
8886        """
8887
8888        # if FORMAT and samples
8889        if (
8890            "FORMAT" in self.get_header_columns_as_list()
8891            and self.get_header_sample_list()
8892        ):
8893
8894            # trio annotation field
8895            trio_tag = "trio"
8896
8897            # VCF infos tags
8898            vcf_infos_tags = {
8899                "trio": "trio calculation",
8900            }
8901
8902            # Param
8903            param = self.get_param()
8904
8905            # Prefix
8906            prefix = self.get_explode_infos_prefix()
8907
8908            # Trio param
8909            trio_ped = (
8910                param.get("calculation", {})
8911                .get("calculations", {})
8912                .get("TRIO", {})
8913                .get("trio_pedigree", None)
8914            )
8915
8916            # Load trio
8917            if trio_ped:
8918
8919                # Trio pedigree is a file
8920                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
8921                    log.debug("TRIO pedigree is file")
8922                    with open(full_path(trio_ped)) as trio_ped:
8923                        trio_ped = json.load(trio_ped)
8924
8925                # Trio pedigree is a string
8926                elif isinstance(trio_ped, str):
8927                    log.debug("TRIO pedigree is str")
8928                    try:
8929                        trio_ped = json.loads(trio_ped)
8930                        log.debug("TRIO pedigree is json str")
8931                    except ValueError as e:
8932                        trio_samples = trio_ped.split(",")
8933                        if len(trio_samples) == 3:
8934                            trio_ped = {
8935                                "father": trio_samples[0],
8936                                "mother": trio_samples[1],
8937                                "child": trio_samples[2],
8938                            }
8939                            log.debug("TRIO pedigree is list str")
8940                        else:
8941                            msg_error = "TRIO pedigree not well formatted"
8942                            log.error(msg_error)
8943                            raise ValueError(msg_error)
8944
8945                # Trio pedigree is a dict
8946                elif isinstance(trio_ped, dict):
8947                    log.debug("TRIO pedigree is dict")
8948
8949                # Trio pedigree is not well formatted
8950                else:
8951                    msg_error = "TRIO pedigree not well formatted"
8952                    log.error(msg_error)
8953                    raise ValueError(msg_error)
8954
8955                # Construct trio list
8956                trio_samples = [
8957                    trio_ped.get("father", ""),
8958                    trio_ped.get("mother", ""),
8959                    trio_ped.get("child", ""),
8960                ]
8961
8962            else:
8963                log.debug("TRIO pedigree not defined. Take the first 3 samples")
8964                samples_list = self.get_header_sample_list()
8965                if len(samples_list) >= 3:
8966                    trio_samples = self.get_header_sample_list()[0:3]
8967                    trio_ped = {
8968                        "father": trio_samples[0],
8969                        "mother": trio_samples[1],
8970                        "child": trio_samples[2],
8971                    }
8972                else:
8973                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
8974                    log.error(msg_error)
8975                    raise ValueError(msg_error)
8976
8977            # Check trio pedigree
8978            if not trio_ped or len(trio_ped) != 3:
8979                msg_error = f"Error in TRIO pedigree: {trio_ped}"
8980                log.error(msg_error)
8981                raise ValueError(msg_error)
8982
8983            # Log
8984            log.info(
8985                f"Calculation 'TRIO' - Samples: "
8986                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
8987            )
8988
8989            # Field
8990            trio_infos = prefix + trio_tag
8991
8992            # Variants table
8993            table_variants = self.get_table_variants()
8994
8995            # Header
8996            vcf_reader = self.get_header()
8997
8998            # Create variant id
8999            variant_id_column = self.get_variant_id_column()
9000            added_columns = [variant_id_column]
9001
9002            # variant_id, FORMAT and samples
9003            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9004                self.get_header_sample_list()
9005            )
9006
9007            # Create dataframe
9008            dataframe_trio = self.get_query_to_df(
9009                f""" SELECT {samples_fields} FROM {table_variants} """
9010            )
9011
9012            # Create trio column
9013            dataframe_trio[trio_infos] = dataframe_trio.apply(
9014                lambda row: trio(row, samples=trio_samples), axis=1
9015            )
9016
9017            # Add trio to header
9018            vcf_reader.infos[trio_tag] = vcf.parser._Info(
9019                trio_tag,
9020                ".",
9021                "String",
9022                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
9023                "howard calculation",
9024                "0",
9025                self.code_type_map.get("String"),
9026            )
9027
9028            # Update
9029            sql_update = f"""
9030                UPDATE {table_variants}
9031                SET "INFO" = 
9032                    concat(
9033                        CASE
9034                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9035                            THEN ''
9036                            ELSE concat("INFO", ';')
9037                        END,
9038                        CASE
9039                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
9040                             AND dataframe_trio."{trio_infos}" NOT NULL
9041                            THEN concat(
9042                                    '{trio_tag}=',
9043                                    dataframe_trio."{trio_infos}"
9044                                )
9045                            ELSE ''
9046                        END
9047                    )
9048                FROM dataframe_trio
9049                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
9050            """
9051            self.conn.execute(sql_update)
9052
9053            # Remove added columns
9054            for added_column in added_columns:
9055                self.drop_column(column=added_column)
9056
9057            # Delete dataframe
9058            del dataframe_trio
9059            gc.collect()

The calculation_trio function performs trio calculations on a VCF file by adding trio information to the INFO field of each variant.

def calculation_vaf_normalization(self) -> None:
9061    def calculation_vaf_normalization(self) -> None:
9062        """
9063        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
9064        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
9065        :return: The function does not return anything.
9066        """
9067
9068        # if FORMAT and samples
9069        if (
9070            "FORMAT" in self.get_header_columns_as_list()
9071            and self.get_header_sample_list()
9072        ):
9073
9074            # vaf_normalization annotation field
9075            vaf_normalization_tag = "VAF"
9076
9077            # VCF infos tags
9078            vcf_infos_tags = {
9079                "VAF": "VAF Variant Frequency",
9080            }
9081
9082            # Prefix
9083            prefix = self.get_explode_infos_prefix()
9084
9085            # Variants table
9086            table_variants = self.get_table_variants()
9087
9088            # Header
9089            vcf_reader = self.get_header()
9090
9091            # Do not calculate if VAF already exists
9092            if "VAF" in vcf_reader.formats:
9093                log.debug("VAF already on genotypes")
9094                return
9095
9096            # Create variant id
9097            variant_id_column = self.get_variant_id_column()
9098            added_columns = [variant_id_column]
9099
9100            # variant_id, FORMAT and samples
9101            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9102                f""" "{sample}" """ for sample in self.get_header_sample_list()
9103            )
9104
9105            # Create dataframe
9106            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
9107            log.debug(f"query={query}")
9108            dataframe_vaf_normalization = self.get_query_to_df(query=query)
9109
9110            vaf_normalization_set = []
9111
9112            # for each sample vaf_normalization
9113            for sample in self.get_header_sample_list():
9114                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
9115                    lambda row: vaf_normalization(row, sample=sample), axis=1
9116                )
9117                vaf_normalization_set.append(
9118                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
9119                )
9120
9121            # Add VAF to FORMAT
9122            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
9123                "FORMAT"
9124            ].apply(lambda x: str(x) + ":VAF")
9125            vaf_normalization_set.append(
9126                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
9127            )
9128
9129            # Add vaf_normalization to header
9130            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
9131                id=vaf_normalization_tag,
9132                num="1",
9133                type="Float",
9134                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
9135                type_code=self.code_type_map.get("Float"),
9136            )
9137
9138            # Create fields to add in INFO
9139            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
9140
9141            # Update
9142            sql_update = f"""
9143                UPDATE {table_variants}
9144                SET {sql_vaf_normalization_set}
9145                FROM dataframe_vaf_normalization
9146                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
9147
9148            """
9149            self.conn.execute(sql_update)
9150
9151            # Remove added columns
9152            for added_column in added_columns:
9153                self.drop_column(column=added_column)
9154
9155            # Delete dataframe
9156            del dataframe_vaf_normalization
9157            gc.collect()

The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency) normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.

Returns

The function does not return anything.

def calculation_genotype_stats(self, info: str = 'VAF') -> None:
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable when genotypes are present (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Name of the dict-valued stats field derived from `info`
            vaf_stats_tag = info + "_stats"

            # VCF INFO tag descriptions, one per computed statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the dataframe column holding the stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Column uniquely identifying each variant (added temporarily, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotypes into a pandas dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the per-variant stats dict across all samples
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistic, later concatenated into INFO
            sql_vaf_stats_fields = []

            # For each statistic: extract its value into its own column,
            # declare the header tag, and build the INFO fragment
            for stat in vcf_infos_tags:

                # Extract this statistic from the stats dict ('' when absent)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the statistic tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separator: nothing before the first fragment, ';' before the rest
                # NOTE(review): if the first stat's value is NULL its fragment is ''
                # while later fragments still start with ';', which may leave a
                # double ';' after the INFO prefix — confirm whether stats can be NULL
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Fragment emitting '<sep><stat>=<value>' when the value is not NULL
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # All fragments become successive arguments of the outer concat()
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Append all statistics to INFO in a single UPDATE joined on the variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Free the dataframe memory
            del dataframe_vaf_stats
            gc.collect()

The calculation_genotype_stats function calculates genotype statistics for a given information field in a VCF file and updates the INFO column of the variants table with the calculated statistics.

Parameters
  • info: The info parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, the median, defaults to VAF
def calculation_transcripts_json(self, info: str = 'transcripts_json') -> None:
9297    def calculation_transcripts_json(self, info: str = "transcripts_json") -> None:
9298        """
9299        The function `calculation_transcripts_json` creates a transcripts table and adds an info field
9300        to it if transcripts are available.
9301
9302        :param info: The `info` parameter in the `calculation_transcripts_json` method is a string
9303        parameter that specifies the information field to be used in the transcripts JSON. It has a
9304        default value of "transcripts_json" if no value is provided when calling the method, defaults to
9305        transcripts_json
9306        :type info: str (optional)
9307        """
9308
9309        # Create transcripts table
9310        transcripts_table = self.create_transcript_view()
9311
9312        # Add info field
9313        if transcripts_table:
9314            self.transcript_view_to_variants(
9315                transcripts_table=transcripts_table, transcripts_info_field=info
9316            )
9317        else:
9318            log.info("No Transcripts to process. Check param.json file configuration")

The function calculation_transcripts_json creates a transcripts table and adds an info field to it if transcripts are available.

Parameters
  • info: The info parameter in the calculation_transcripts_json method is a string parameter that specifies the information field to be used in the transcripts JSON. It has a default value of "transcripts_json" if no value is provided when calling the method, defaults to transcripts_json
def calculation_transcripts_prioritization(self) -> None:
9320    def calculation_transcripts_prioritization(self) -> None:
9321        """
9322        The function `calculation_transcripts_prioritization` creates a transcripts table and
9323        prioritizes transcripts based on certain criteria.
9324        """
9325
9326        # Create transcripts table
9327        transcripts_table = self.create_transcript_view()
9328
9329        # Add info field
9330        if transcripts_table:
9331            self.transcripts_prioritization(transcripts_table=transcripts_table)
9332        else:
9333            log.info("No Transcripts to process. Check param.json file configuration")

The function calculation_transcripts_prioritization creates a transcripts table and prioritizes transcripts based on certain criteria.

def transcripts_prioritization(self, transcripts_table: str = None, param: dict = {}) -> bool:
9339    def transcripts_prioritization(
9340        self, transcripts_table: str = None, param: dict = {}
9341    ) -> bool:
9342        """
9343        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
9344        and updates the variants table with the prioritized information.
9345
9346        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
9347        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
9348        This parameter is used to identify the table where the transcripts data is stored for the
9349        prioritization process
9350        :type transcripts_table: str
9351        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
9352        that contains various configuration settings for the prioritization process of transcripts. It
9353        is used to customize the behavior of the prioritization algorithm and includes settings such as
9354        the prefix for prioritization fields, default profiles, and other
9355        :type param: dict
9356        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
9357        transcripts prioritization process is successfully completed, and `False` if there are any
9358        issues or if no profile is defined for transcripts prioritization.
9359        """
9360
9361        log.debug("Start transcripts prioritization...")
9362
9363        # Param
9364        if not param:
9365            param = self.get_param()
9366
9367        # Variants table
9368        table_variants = self.get_table_variants()
9369        log.debug(f"transcripts_table={transcripts_table}")
9370        # Transcripts table
9371        if transcripts_table is None:
9372            log.debug(f"transcripts_table={transcripts_table}")
9373            transcripts_table = self.create_transcript_view(
9374                transcripts_table="transcripts", param=param
9375            )
9376            log.debug(f"transcripts_table={transcripts_table}")
9377        if transcripts_table is None:
9378            msg_err = "No Transcripts table availalble"
9379            log.error(msg_err)
9380            raise ValueError(msg_err)
9381
9382        # Get transcripts columns
9383        columns_as_list_query = f"""
9384            DESCRIBE {transcripts_table}
9385        """
9386        columns_as_list = list(
9387            self.get_query_to_df(columns_as_list_query)["column_name"]
9388        )
9389
9390        # Create INFO if not exists
9391        if "INFO" not in columns_as_list:
9392            query_add_info = f"""
9393                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
9394            """
9395            self.execute_query(query_add_info)
9396
9397        # Prioritization param and Force only PZ Score and Flag
9398        pz_param = param.get("transcripts", {}).get("prioritization", {})
9399        pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
9400        pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
9401        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
9402        pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
9403        pz_profile_default = (
9404            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
9405        )
9406
9407        # Exit if no profile
9408        if pz_profile_default is None:
9409            log.warning("No profile defined for transcripts prioritization")
9410            return False
9411
9412        # Prioritization
9413        prioritization_result = self.prioritization(
9414            table=transcripts_table,
9415            pz_param=param.get("transcripts", {}).get("prioritization", {}),
9416        )
9417        if not prioritization_result:
9418            log.warning("Transcripts prioritization not processed")
9419            return False
9420
9421        # Explode PZ fields
9422        self.explode_infos(
9423            table=transcripts_table,
9424            fields=param.get("transcripts", {})
9425            .get("prioritization", {})
9426            .get("pzfields", []),
9427        )
9428
9429        # Export Transcripts prioritization infos to variants table
9430        query_update = f"""
9431            WITH RankedTranscripts AS (
9432                SELECT
9433                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
9434                    ROW_NUMBER() OVER (
9435                        PARTITION BY "#CHROM", POS, REF, ALT
9436                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
9437                    ) AS rn
9438                FROM
9439                    {transcripts_table}
9440            )
9441            UPDATE {table_variants}
9442                SET
9443                INFO = CONCAT(CASE
9444                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9445                            THEN ''
9446                            ELSE concat("INFO", ';')
9447                        END,
9448                        concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
9449                        )
9450            FROM
9451                RankedTranscripts
9452            WHERE
9453                rn = 1
9454                AND variants."#CHROM" = RankedTranscripts."#CHROM"
9455                AND variants."POS" = RankedTranscripts."POS"
9456                AND variants."REF" = RankedTranscripts."REF"
9457                AND variants."ALT" = RankedTranscripts."ALT"
9458                
9459        """
9460        self.execute_query(query=query_update)
9461
9462        # Add PZ Transcript in header
9463        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
9464            pz_fields_transcripts,
9465            ".",
9466            "String",
9467            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
9468            "unknown",
9469            "unknown",
9470            code_type_map["String"],
9471        )
9472
9473        # Return
9474        return True

The transcripts_prioritization function prioritizes transcripts based on certain parameters and updates the variants table with the prioritized information.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process
  • param: The param parameter in the transcripts_prioritization method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields, default profiles, and other
Returns

The function transcripts_prioritization returns a boolean value True if the transcripts prioritization process is successfully completed, and False if there are any issues or if no profile is defined for transcripts prioritization.

def create_transcript_view_from_columns_map( self, transcripts_table: str = 'transcripts', columns_maps: dict = {}, added_columns: list = [], temporary_tables: list = None, annotation_fields: list = None) -> tuple[list, list, list]:
9476    def create_transcript_view_from_columns_map(
9477        self,
9478        transcripts_table: str = "transcripts",
9479        columns_maps: dict = {},
9480        added_columns: list = [],
9481        temporary_tables: list = None,
9482        annotation_fields: list = None,
9483    ) -> tuple[list, list, list]:
9484        """
9485        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
9486        specified columns mapping for transcripts data.
9487
9488        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
9489        the table where the transcripts data is stored or will be stored in the database. This table
9490        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
9491        predictions, etc. It defaults to "transcripts, defaults to transcripts
9492        :type transcripts_table: str (optional)
9493        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
9494        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
9495        represents a mapping configuration for a specific set of columns. It typically includes details such
9496        as the main transcript column and additional information columns
9497        :type columns_maps: dict
9498        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
9499        function is a list that stores the additional columns that will be added to the view being created
9500        based on the columns map provided. These columns are generated by exploding the transcript
9501        information columns along with the main transcript column
9502        :type added_columns: list
9503        :param temporary_tables: The `temporary_tables` parameter in the
9504        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
9505        tables created during the process of creating a transcript view from a columns map. These temporary
9506        tables are used to store intermediate results or transformations before the final view is generated
9507        :type temporary_tables: list
9508        :param annotation_fields: The `annotation_fields` parameter in the
9509        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
9510        for annotation in the query view creation process. These fields are extracted from the
9511        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
9512        :type annotation_fields: list
9513        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
9514        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
9515        """
9516
9517        log.debug("Start transcrpts view creation from columns map...")
9518
9519        # "from_columns_map": [
9520        #     {
9521        #         "transcripts_column": "Ensembl_transcriptid",
9522        #         "transcripts_infos_columns": [
9523        #             "genename",
9524        #             "Ensembl_geneid",
9525        #             "LIST_S2_score",
9526        #             "LIST_S2_pred",
9527        #         ],
9528        #     },
9529        #     {
9530        #         "transcripts_column": "Ensembl_transcriptid",
9531        #         "transcripts_infos_columns": [
9532        #             "genename",
9533        #             "VARITY_R_score",
9534        #             "Aloft_pred",
9535        #         ],
9536        #     },
9537        # ],
9538
9539        # Init
9540        if temporary_tables is None:
9541            temporary_tables = []
9542        if annotation_fields is None:
9543            annotation_fields = []
9544
9545        # Variants table
9546        table_variants = self.get_table_variants()
9547
9548        for columns_map in columns_maps:
9549
9550            # Transcript column
9551            transcripts_column = columns_map.get("transcripts_column", None)
9552
9553            # Transcripts infos columns
9554            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
9555
9556            if transcripts_column is not None:
9557
9558                # Explode
9559                added_columns += self.explode_infos(
9560                    fields=[transcripts_column] + transcripts_infos_columns
9561                )
9562
9563                # View clauses
9564                clause_select = []
9565                for field in [transcripts_column] + transcripts_infos_columns:
9566                    clause_select.append(
9567                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
9568                    )
9569                    if field not in [transcripts_column]:
9570                        annotation_fields.append(field)
9571
9572                # Querey View
9573                query = f""" 
9574                    SELECT
9575                        "#CHROM", POS, REF, ALT,
9576                        "{transcripts_column}" AS 'transcript',
9577                        {", ".join(clause_select)}
9578                    FROM (
9579                        SELECT 
9580                            "#CHROM", POS, REF, ALT,
9581                            {", ".join(clause_select)}
9582                        FROM {table_variants}
9583                        )
9584                    WHERE "{transcripts_column}" IS NOT NULL
9585                """
9586
9587                # Create temporary table
9588                temporary_table = transcripts_table + "".join(
9589                    random.choices(string.ascii_uppercase + string.digits, k=10)
9590                )
9591
9592                # Temporary_tables
9593                temporary_tables.append(temporary_table)
9594                query_view = f"""
9595                    CREATE TEMPORARY TABLE {temporary_table}
9596                    AS ({query})
9597                """
9598                self.execute_query(query=query_view)
9599
9600        return added_columns, temporary_tables, annotation_fields

The create_transcript_view_from_columns_map function generates a temporary table view based on specified columns mapping for transcripts data.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. Defaults to "transcripts".
  • columns_maps: The columns_maps parameter is a dictionary that contains information about how to map columns from a transcripts table to create a view. Each entry in the columns_maps list represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns
  • added_columns: The added_columns parameter in the create_transcript_view_from_columns_map function is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_columns_map function is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_columns_map function is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from the transcripts_column and transcripts_infos_columns specified in each entry of the columns_maps parameter.
Returns

The function create_transcript_view_from_columns_map returns a tuple containing three lists: added_columns, temporary_tables, and annotation_fields.

def create_transcript_view_from_column_format( self, transcripts_table: str = 'transcripts', column_formats: dict = {}, temporary_tables: list = None, annotation_fields: list = None) -> tuple[list, list, list]:
9602    def create_transcript_view_from_column_format(
9603        self,
9604        transcripts_table: str = "transcripts",
9605        column_formats: dict = {},
9606        temporary_tables: list = None,
9607        annotation_fields: list = None,
9608    ) -> tuple[list, list, list]:
9609        """
9610        The `create_transcript_view_from_column_format` function generates a transcript view based on
9611        specified column formats, adds additional columns and annotation fields, and returns the list of
9612        temporary tables and annotation fields.
9613
9614        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
9615        the table containing the transcripts data. This table will be used as the base table for creating
9616        the transcript view. The default value for this parameter is "transcripts", but you can provide a
9617        different table name if needed, defaults to transcripts
9618        :type transcripts_table: str (optional)
9619        :param column_formats: The `column_formats` parameter is a dictionary that contains information
9620        about the columns to be used for creating the transcript view. Each entry in the dictionary
9621        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
9622        the provided code snippet:
9623        :type column_formats: dict
9624        :param temporary_tables: The `temporary_tables` parameter in the
9625        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
9626        views created during the process of creating a transcript view from a column format. These temporary
9627        views are used to manipulate and extract data before generating the final transcript view. It
9628        :type temporary_tables: list
9629        :param annotation_fields: The `annotation_fields` parameter in the
9630        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
9631        that are extracted from the temporary views created during the process. These annotation fields are
9632        obtained by querying the temporary views and extracting the column names excluding specific columns
9633        like `#CH
9634        :type annotation_fields: list
9635        :return: The `create_transcript_view_from_column_format` function returns two lists:
9636        `temporary_tables` and `annotation_fields`.
9637        """
9638
9639        log.debug("Start transcrpts view creation from column format...")
9640
9641        #  "from_column_format": [
9642        #     {
9643        #         "transcripts_column": "ANN",
9644        #         "transcripts_infos_column": "Feature_ID",
9645        #     }
9646        # ],
9647
9648        # Init
9649        if temporary_tables is None:
9650            temporary_tables = []
9651        if annotation_fields is None:
9652            annotation_fields = []
9653
9654        for column_format in column_formats:
9655
9656            # annotation field and transcript annotation field
9657            annotation_field = column_format.get("transcripts_column", "ANN")
9658            transcript_annotation = column_format.get(
9659                "transcripts_infos_column", "Feature_ID"
9660            )
9661
9662            # Temporary View name
9663            temporary_view_name = transcripts_table + "".join(
9664                random.choices(string.ascii_uppercase + string.digits, k=10)
9665            )
9666
9667            # Create temporary view name
9668            temporary_view_name = self.annotation_format_to_table(
9669                uniquify=True,
9670                annotation_field=annotation_field,
9671                view_name=temporary_view_name,
9672                annotation_id=transcript_annotation,
9673            )
9674
9675            # Annotation fields
9676            if temporary_view_name:
9677                query_annotation_fields = f"""
9678                    SELECT *
9679                    FROM (
9680                        DESCRIBE SELECT *
9681                        FROM {temporary_view_name}
9682                        )
9683                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
9684                """
9685                df_annotation_fields = self.get_query_to_df(
9686                    query=query_annotation_fields
9687                )
9688
9689                # Add temporary view and annotation fields
9690                temporary_tables.append(temporary_view_name)
9691                annotation_fields += list(set(df_annotation_fields["column_name"]))
9692
9693        return temporary_tables, annotation_fields

The create_transcript_view_from_column_format function generates a transcript view based on specified column formats, adds additional columns and annotation fields, and returns the list of temporary tables and annotation fields.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed, defaults to transcripts
  • column_formats: The column_formats parameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry in the dictionary specifies the mapping between a transcripts column and a transcripts infos column. For example, in the provided code snippet:
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_column_format function is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view. It
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_column_format function is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names, excluding the variant key columns '#CHROM', 'POS', 'REF', and 'ALT'.
Returns

The create_transcript_view_from_column_format function returns two lists: temporary_tables and annotation_fields.

def create_transcript_view( self, transcripts_table: str = None, transcripts_table_drop: bool = True, param: dict = {}) -> str:
9695    def create_transcript_view(
9696        self,
9697        transcripts_table: str = None,
9698        transcripts_table_drop: bool = True,
9699        param: dict = {},
9700    ) -> str:
9701        """
9702        The `create_transcript_view` function generates a transcript view by processing data from a
9703        specified table based on provided parameters and structural information.
9704
9705        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
9706        is used to specify the name of the table that will store the final transcript view data. If a table
9707        name is not provided, the function will create a new table to store the transcript view data, and by
9708        default,, defaults to transcripts
9709        :type transcripts_table: str (optional)
9710        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
9711        `create_transcript_view` function is a boolean parameter that determines whether to drop the
9712        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
9713        the function will drop the existing transcripts table if it exists, defaults to True
9714        :type transcripts_table_drop: bool (optional)
9715        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
9716        contains information needed to create a transcript view. It includes details such as the structure
9717        of the transcripts, columns mapping, column formats, and other necessary information for generating
9718        the view. This parameter allows for flexibility and customization
9719        :type param: dict
9720        :return: The `create_transcript_view` function returns the name of the transcripts table that was
9721        created or modified during the execution of the function.
9722        """
9723
9724        log.debug("Start transcripts view creation...")
9725
9726        # Default
9727        transcripts_table_default = "transcripts"
9728
9729        # Param
9730        if not param:
9731            param = self.get_param()
9732
9733        # Struct
9734        struct = param.get("transcripts", {}).get("struct", None)
9735
9736        if struct:
9737
9738            # Transcripts table
9739            if transcripts_table is None:
9740                transcripts_table = param.get("transcripts", {}).get(
9741                    "table", transcripts_table_default
9742                )
9743
9744            # added_columns
9745            added_columns = []
9746
9747            # Temporary tables
9748            temporary_tables = []
9749
9750            # Annotation fields
9751            annotation_fields = []
9752
9753            # from columns map
9754            columns_maps = struct.get("from_columns_map", [])
9755            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
9756                self.create_transcript_view_from_columns_map(
9757                    transcripts_table=transcripts_table,
9758                    columns_maps=columns_maps,
9759                    added_columns=added_columns,
9760                    temporary_tables=temporary_tables,
9761                    annotation_fields=annotation_fields,
9762                )
9763            )
9764            added_columns += added_columns_tmp
9765            temporary_tables += temporary_tables_tmp
9766            annotation_fields += annotation_fields_tmp
9767
9768            # from column format
9769            column_formats = struct.get("from_column_format", [])
9770            temporary_tables_tmp, annotation_fields_tmp = (
9771                self.create_transcript_view_from_column_format(
9772                    transcripts_table=transcripts_table,
9773                    column_formats=column_formats,
9774                    temporary_tables=temporary_tables,
9775                    annotation_fields=annotation_fields,
9776                )
9777            )
9778            temporary_tables += temporary_tables_tmp
9779            annotation_fields += annotation_fields_tmp
9780
9781            # Merge temporary tables query
9782            query_merge = ""
9783            for temporary_table in temporary_tables:
9784
9785                # First temporary table
9786                if not query_merge:
9787                    query_merge = f"""
9788                        SELECT * FROM {temporary_table}
9789                    """
9790                # other temporary table (using UNION)
9791                else:
9792                    query_merge += f"""
9793                        UNION BY NAME SELECT * FROM {temporary_table}
9794                    """
9795
9796            # Merge on transcript
9797            query_merge_on_transcripts_annotation_fields = []
9798            # Aggregate all annotations fields
9799            for annotation_field in set(annotation_fields):
9800                query_merge_on_transcripts_annotation_fields.append(
9801                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
9802                )
9803            # Query for transcripts view
9804            query_merge_on_transcripts = f"""
9805                SELECT "#CHROM", POS, REF, ALT, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
9806                FROM ({query_merge})
9807                GROUP BY "#CHROM", POS, REF, ALT, transcript
9808            """
9809
9810            # Drop transcript view is necessary
9811            if transcripts_table_drop:
9812                query_drop = f"""
9813                    DROP TABLE IF EXISTS {transcripts_table};
9814                """
9815                self.execute_query(query=query_drop)
9816
9817            # Merge and create transcript view
9818            query_create_view = f"""
9819                CREATE TABLE IF NOT EXISTS {transcripts_table}
9820                AS {query_merge_on_transcripts}
9821            """
9822            self.execute_query(query=query_create_view)
9823
9824            # Remove added columns
9825            for added_column in added_columns:
9826                self.drop_column(column=added_column)
9827
9828        else:
9829
9830            transcripts_table = None
9831
9832        return transcripts_table

The create_transcript_view function generates a transcript view by processing data from a specified table based on provided parameters and structural information.

Parameters
  • transcripts_table: The transcripts_table parameter in the create_transcript_view function is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function will create a new table to store the transcript view data. Defaults to "transcripts".
  • transcripts_table_drop: The transcripts_table_drop parameter in the create_transcript_view function is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. If transcripts_table_drop is set to True, the function will drop the existing transcripts table if it exists, defaults to True
  • param: The param parameter in the create_transcript_view function is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization
Returns

The create_transcript_view function returns the name of the transcripts table that was created or modified during the execution of the function.

def annotation_format_to_table( self, uniquify: bool = True, annotation_field: str = 'ANN', annotation_id: str = 'Feature_ID', view_name: str = 'transcripts') -> str:
9834    def annotation_format_to_table(
9835        self,
9836        uniquify: bool = True,
9837        annotation_field: str = "ANN",
9838        annotation_id: str = "Feature_ID",
9839        view_name: str = "transcripts",
9840    ) -> str:
9841        """
9842        The function `annotation_format_to_table` converts annotation data from a VCF file into a structured
9843        table format.
9844
9845        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique
9846        values in the output or not. If set to `True`, the function will make sure that the output values
9847        are unique, defaults to True
9848        :type uniquify: bool (optional)
9849        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file that
9850        contains the annotation information for each variant. This field is used to extract the annotation
9851        details for further processing in the function, defaults to ANN
9852        :type annotation_field: str (optional)
9853        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method is
9854        used to specify the identifier for the annotation feature. This identifier will be used as a column
9855        name in the resulting table or view that is created based on the annotation data. It helps in
9856        uniquely identifying each annotation entry in the, defaults to Feature_ID
9857        :type annotation_id: str (optional)
9858        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used to
9859        specify the name of the temporary table that will be created to store the transformed annotation
9860        data. This table will hold the extracted information from the annotation field in a structured
9861        format for further processing or analysis, defaults to transcripts
9862        :type view_name: str (optional)
9863        :return: The function `annotation_format_to_table` is returning the name of the view created, which
9864        is stored in the variable `view_name`.
9865        """
9866
9867        # Annotation field
9868        annotation_format = "annotation_explode"
9869
9870        # Transcript annotation
9871        annotation_id = "".join(char for char in annotation_id if char.isalnum())
9872
9873        # Prefix
9874        prefix = self.get_explode_infos_prefix()
9875        if prefix:
9876            prefix = "INFO/"
9877
9878        # Annotation fields
9879        annotation_infos = prefix + annotation_field
9880        annotation_format_infos = prefix + annotation_format
9881
9882        # Variants table
9883        table_variants = self.get_table_variants()
9884
9885        # Header
9886        vcf_reader = self.get_header()
9887
9888        # Add columns
9889        added_columns = []
9890
9891        # Explode HGVS field in column
9892        added_columns += self.explode_infos(fields=[annotation_field])
9893
9894        if annotation_field in vcf_reader.infos:
9895
9896            # Extract ANN header
9897            ann_description = vcf_reader.infos[annotation_field].desc
9898            pattern = r"'(.+?)'"
9899            match = re.search(pattern, ann_description)
9900            if match:
9901                ann_header_match = match.group(1).split(" | ")
9902                ann_header = []
9903                ann_header_desc = {}
9904                for i in range(len(ann_header_match)):
9905                    ann_header_info = "".join(
9906                        char for char in ann_header_match[i] if char.isalnum()
9907                    )
9908                    ann_header.append(ann_header_info)
9909                    ann_header_desc[ann_header_info] = ann_header_match[i]
9910                if not ann_header_desc:
9911                    raise ValueError("Invalid header description format")
9912            else:
9913                raise ValueError("Invalid header description format")
9914
9915            # Create variant id
9916            variant_id_column = self.get_variant_id_column()
9917            added_columns += [variant_id_column]
9918
9919            # Create dataframe
9920            dataframe_annotation_format = self.get_query_to_df(
9921                f""" SELECT "#CHROM", POS, REF, ALT, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
9922            )
9923
9924            # Create annotation columns
9925            dataframe_annotation_format[
9926                annotation_format_infos
9927            ] = dataframe_annotation_format[annotation_infos].apply(
9928                lambda x: explode_annotation_format(
9929                    annotation=str(x),
9930                    uniquify=uniquify,
9931                    output_format="JSON",
9932                    prefix="",
9933                    header=list(ann_header_desc.values()),
9934                )
9935            )
9936
9937            # Find keys
9938            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
9939            df_keys = self.get_query_to_df(query=query_json)
9940
9941            # Check keys
9942            query_json_key = []
9943            for _, row in df_keys.iterrows():
9944
9945                # Key
9946                key = row.iloc[0]
9947
9948                # key_clean
9949                key_clean = "".join(char for char in key if char.isalnum())
9950
9951                # Type
9952                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""
9953
9954                # Get DataFrame from query
9955                df_json_type = self.get_query_to_df(query=query_json_type)
9956
9957                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
9958                with pd.option_context("future.no_silent_downcasting", True):
9959                    df_json_type.fillna(value="", inplace=True)
9960                    replace_dict = {None: np.nan, "": np.nan}
9961                    df_json_type.replace(replace_dict, inplace=True)
9962                    df_json_type.dropna(inplace=True)
9963
9964                # Detect column type
9965                column_type = detect_column_type(df_json_type[key_clean])
9966
9967                # Append
9968                query_json_key.append(
9969                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
9970                )
9971
9972            # Create view
9973            query_view = f"""CREATE TEMPORARY TABLE {view_name} AS (SELECT *, {annotation_id} AS 'transcript' FROM (SELECT "#CHROM", POS, REF, ALT, {",".join(query_json_key)} FROM dataframe_annotation_format));"""
9974            self.execute_query(query=query_view)
9975
9976        else:
9977
9978            # Return None
9979            view_name = None
9980
9981        # Remove added columns
9982        for added_column in added_columns:
9983            self.drop_column(column=added_column)
9984
9985        return view_name

The function annotation_format_to_table converts annotation data from a VCF file into a structured table format.

Parameters
  • uniquify: The uniquify parameter is a boolean flag that determines whether to ensure unique values in the output or not. If set to True, the function will make sure that the output values are unique, defaults to True
  • annotation_field: The annotation_field parameter refers to the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function, defaults to ANN
  • annotation_id: The annotation_id parameter in the annotation_format_to_table method is used to specify the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data. It helps in uniquely identifying each annotation entry in the resulting table; defaults to Feature_ID
  • view_name: The view_name parameter in the annotation_format_to_table method is used to specify the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis, defaults to transcripts
Returns

The function annotation_format_to_table is returning the name of the view created, which is stored in the variable view_name.

def transcript_view_to_variants( self, transcripts_table: str = None, transcripts_column_id: str = None, transcripts_info_json: str = None, transcripts_info_field: str = None, param: dict = {}) -> bool:
 9987    def transcript_view_to_variants(
 9988        self,
 9989        transcripts_table: str = None,
 9990        transcripts_column_id: str = None,
 9991        transcripts_info_json: str = None,
 9992        transcripts_info_field: str = None,
 9993        param: dict = {},
 9994    ) -> bool:
 9995        """
 9996        The function `transcript_view_to_variants` takes input parameters related to transcripts and updates
 9997        a variants table with information from the transcripts in JSON format.
 9998
 9999        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the table
10000        containing the transcripts data. If this parameter is not provided, the function will attempt to
10001        retrieve it from the `param` dictionary or use a default value of "transcripts"
10002        :type transcripts_table: str
10003        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the column in
10004        the `transcripts_table` that contains the unique identifier for each transcript. This identifier is
10005        used to match transcripts with variants in the database
10006        :type transcripts_column_id: str
10007        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name of
10008        the column in the variants table where the transcripts information will be stored in JSON format
10009        :type transcripts_info_json: str
10010        :param transcripts_info_field: The `transcripts_info_field` parameter is used to specify the field
10011        in the VCF header that will contain information about transcripts in JSON format. This field will be
10012        added to the VCF header as an INFO field with the specified name
10013        :type transcripts_info_field: str
10014        :param param: The `transcript_view_to_variants` method takes several parameters:
10015        :type param: dict
10016        :return: The function `transcript_view_to_variants` returns a boolean value, which is `True` if the
10017        operation is successful and `False` if certain conditions are not met.
10018        """
10019
10020        log.debug("Start transcripts view to JSON...")
10021
10022        # Default
10023        transcripts_table_default = "transcripts"
10024        transcripts_column_id_default = "transcript"
10025        transcripts_info_json_default = None
10026        transcripts_info_field_default = None
10027
10028        # Param
10029        if not param:
10030            param = self.get_param()
10031
10032        # Transcripts table
10033        if transcripts_table is None:
10034            transcripts_table = param.get("transcripts", {}).get(
10035                "table", transcripts_table_default
10036            )
10037
10038        # Transcripts column ID
10039        if transcripts_column_id is None:
10040            transcripts_column_id = param.get("transcripts", {}).get(
10041                "column_id", transcripts_column_id_default
10042            )
10043
10044        # Transcripts info field
10045        if transcripts_info_json is None:
10046            transcripts_info_json = param.get("transcripts", {}).get(
10047                "transcripts_info_json", transcripts_info_json_default
10048            )
10049
10050        # Transcripts info field
10051        if transcripts_info_field is None:
10052            transcripts_info_field = param.get("transcripts", {}).get(
10053                "transcripts_info_field", transcripts_info_field_default
10054            )
10055
10056        # Variants table
10057        table_variants = self.get_table_variants()
10058
10059        # Check info columns param
10060        if transcripts_info_json is None and transcripts_info_field is None:
10061            return False
10062
10063        # Transcripts infos columns
10064        query_transcripts_infos_columns = f"""
10065            SELECT *
10066            FROM (
10067                DESCRIBE SELECT * FROM {transcripts_table}
10068                )
10069            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
10070        """
10071        transcripts_infos_columns = list(
10072            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
10073        )
10074
10075        # View results
10076        clause_select = []
10077        clause_to_json = []
10078        for field in transcripts_infos_columns:
10079            clause_select.append(
10080                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10081            )
10082            clause_to_json.append(f""" '{field}': "{field}" """)
10083
10084        # Update
10085        update_set = []
10086
10087        # VCF header
10088        vcf_reader = self.get_header()
10089
10090        # Transcripts to info column in JSON
10091        if transcripts_info_json is not None:
10092
10093            # Create column on variants table
10094            self.add_column(
10095                table_name=table_variants,
10096                column_name=transcripts_info_json,
10097                column_type="JSON",
10098                default_value=None,
10099                drop=False,
10100            )
10101
10102            # Add to update
10103            update_set.append(
10104                f""" {transcripts_info_json}=t.{transcripts_info_json} """
10105            )
10106
10107            # Add header
10108            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
10109                transcripts_info_json,
10110                ".",
10111                "String",
10112                "Transcripts in JSON format",
10113                "unknwon",
10114                "unknwon",
10115                self.code_type_map["String"],
10116            )
10117
10118        # Transcripts to info field in JSON
10119        if transcripts_info_field is not None:
10120
10121            # Add to update
10122            update_set.append(
10123                f""" 
10124                    INFO = concat(
10125                            CASE
10126                                WHEN INFO NOT IN ('', '.')
10127                                THEN INFO
10128                                ELSE ''
10129                            END,
10130                            CASE
10131                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
10132                                THEN concat(
10133                                    ';{transcripts_info_field}=',
10134                                    t.{transcripts_info_json}
10135                                )
10136                                ELSE ''
10137                            END
10138                            )
10139                """
10140            )
10141
10142            # Add header
10143            vcf_reader.infos[transcripts_info_field] = vcf.parser._Info(
10144                transcripts_info_field,
10145                ".",
10146                "String",
10147                "Transcripts in JSON format",
10148                "unknwon",
10149                "unknwon",
10150                self.code_type_map["String"],
10151            )
10152
10153        # Update query
10154        query_update = f"""
10155            UPDATE {table_variants}
10156                SET {", ".join(update_set)}
10157            FROM
10158            (
10159                SELECT
10160                    "#CHROM", POS, REF, ALT,
10161                        concat(
10162                        '{{',
10163                        string_agg(
10164                            '"' || "{transcripts_column_id}" || '":' ||
10165                            to_json(json_output)
10166                        ),
10167                        '}}'
10168                        )::JSON AS {transcripts_info_json}
10169                FROM
10170                    (
10171                    SELECT
10172                        "#CHROM", POS, REF, ALT,
10173                        "{transcripts_column_id}",
10174                        to_json(
10175                            {{{",".join(clause_to_json)}}}
10176                        )::JSON AS json_output
10177                    FROM
10178                        (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10179                    WHERE "{transcripts_column_id}" IS NOT NULL
10180                    )
10181                GROUP BY "#CHROM", POS, REF, ALT
10182            ) AS t
10183            WHERE {table_variants}."#CHROM" = t."#CHROM"
10184                AND {table_variants}."POS" = t."POS"
10185                AND {table_variants}."REF" = t."REF"
10186                AND {table_variants}."ALT" = t."ALT"
10187        """
10188
10189        self.execute_query(query=query_update)
10190
10191        return True

The function transcript_view_to_variants takes input parameters related to transcripts and updates a variants table with information from the transcripts in JSON format.

Parameters
  • transcripts_table: The transcripts_table parameter is used to specify the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the param dictionary or use a default value of "transcripts"
  • transcripts_column_id: The transcripts_column_id parameter is used to specify the column in the transcripts_table that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database
  • transcripts_info_json: The transcripts_info_json parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format
  • transcripts_info_field: The transcripts_info_field parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name
  • param: A dictionary of configuration values; when empty, the method falls back to the object's stored parameters and reads their "transcripts" section for the defaults of the other arguments
Returns

The function transcript_view_to_variants returns a boolean value, which is True if the operation is successful and False if certain conditions are not met.